Code example #1
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import PowerTransformer


class DFPowerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = PowerTransformer(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://help.gooddata.com/doc/en/reporting-and-dashboards/maql-analytical-query-language/maql-expression-reference/aggregation-functions/statistical-functions/predictive-statistical-use-cases/normality-testing-skewness-and-kurtosis
        # Highly skewed:           Skewness < -1 or Skewness > 1
        # Moderately skewed:       -1 < Skewness < -0.5
        #                          or 0.5 < Skewness < 1
        # Approximately symmetric: -0.5 < Skewness < 0.5
        skew_df = X[self.transform_cols].skew().to_frame(name='Skewness')
        # Normal distributed kurtosis: 3
        kurt_df = X[self.transform_cols].kurt().to_frame(name='Kurtosis')
        self.stat_df = skew_df.merge(kurt_df,
                                     left_index=True,
                                     right_index=True,
                                     how='left')

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(
            X[self.transform_cols])

        # Transformed skewness & kurtosis
        skew_df = new_X[self.transform_cols].skew().to_frame(
            name='Skewness (Transformed)')
        kurt_df = new_X[self.transform_cols].kurt().to_frame(
            name='Kurtosis (Transformed)')
        stat_df = skew_df.merge(kurt_df,
                                left_index=True,
                                right_index=True,
                                how='left')
        self.stat_df = self.stat_df.merge(stat_df,
                                          left_index=True,
                                          right_index=True,
                                          how='left')

        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(
            X[self.transform_cols])

        return new_X
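
A minimal usage sketch of the DFPowerTransformer defined above, on a hypothetical toy DataFrame (the column names and values are made up for illustration):

import numpy as np
import pandas as pd

# Toy data with one heavily right-skewed column and one roughly symmetric column
rng = np.random.default_rng(0)
toy_df = pd.DataFrame({
    'income': rng.lognormal(mean=3.0, sigma=1.0, size=500),
    'age': rng.normal(loc=40, scale=10, size=500),
})

tf = DFPowerTransformer(columns=['income'])  # only transform the skewed column
toy_out = tf.fit_transform(toy_df)

print(tf.stat_df)      # skewness / kurtosis before and after the transform
print(toy_out.head())  # 'income' is Yeo-Johnson transformed, 'age' is untouched
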
Code example #2
#pip install hyperopt
from hyperopt import fmin, hp, tpe
from sklearn.model_selection import StratifiedKFold
nfolds = 5
skf = StratifiedKFold(n_splits=nfolds, shuffle=True)
acc = []

#https://github.com/BIMSBbioinfo/maui/blob/master/vignette/maui_vignette.ipynb
import maui
import maui.utils
print(f'Maui version: {maui.__version__}')
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
dfSILAClog2 = np.log2(dfSILAC + 1)  #really weird scaling
print(pt.fit(dfSILAClog2))
dfSILACtf = pt.transform(dfSILAClog2)
print(pt.lambdas_)
dfSILAClog2tf = maui.utils.scale(dfSILAClog2)
from keras import backend as K
import tensorflow as tf
#K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=12, inter_op_parallelism_threads=12)))
maui_model = maui.Maui(n_hidden=[1100], n_latent=70, epochs=400)
z = maui_model.fit_transform({'mRNA': dfSILAClog2tf})
maui_model.hist.plot()
maui_model.cluster(ami_y=z)

maui_model.kmeans_scores.plot()
import seaborn as sns
sns.clustermap(maui_model.z_)
Code example #3
# Assumed imports for this snippet (the KNN / SVM aliases are guesses for the names used below)
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC as SVM
from sklearn.ensemble import RandomForestClassifier


def grid_search(datFile, splitFile):
    # LOADING DATA FILE
    df = pd.read_csv(datFile, header=None)
    cols = ["z{}".format(x) for x in range(len(df.columns) - 2)]
    cols = cols + ["sample", "class"]
    df.columns = cols

    # LOADING TRAIN_TEST_VALIDATION SPLIT FILE
    split = pd.read_csv(splitFile)
    split = split.drop(["id", "synsetId", "subSynsetId"], axis=1)

    # SETTING SPLIT VARIABLES 1
    train = split.loc[split["split"] == "train"]
    test = split.loc[split["split"] == "test"]
    val = split.loc[split["split"] == "val"]

    # SETTING SPLIT VARIABLES 2
    train_set = df.loc[df["sample"].isin(train["modelId"])]
    test_set = df.loc[df["sample"].isin(test["modelId"])]
    val_set = df.loc[df["sample"].isin(val["modelId"])]

    # SETTING SPLIT VARIABLES 3
    X_train = train_set.drop(["sample", "class"], axis=1)
    y_train = train_set["class"]
    X_test = test_set.drop(["sample", "class"], axis=1)
    y_test = test_set["class"]
    X_val = val_set.drop(["sample", "class"], axis=1)
    y_val = val_set["class"]

    # STANDARDIZATION
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    X_val = scaler.transform(X_val)

    # SKEW REMOVAL
    pt = PowerTransformer(method="yeo-johnson", standardize=False)
    pt.fit(X_train)
    X_train = pt.transform(X_train)
    X_test = pt.transform(X_test)
    X_val = pt.transform(X_val)

    # TRAIN + VAL
    X_trainval = np.concatenate([X_train, X_val])
    y_trainval = pd.concat([y_train, y_val])
    prefold = [-1 for x in range(X_train.shape[0])] + \
           [0 for x in range(X_val.shape[0])]

    # GRIDSEARCHCV - KNN
    ps = PredefinedSplit(prefold)
    knn = KNN()

    param_grid = {
        "p": [1, 2],
        "n_neighbors": [5, 6, 7, 8, 9, 10],
        "weights": ["uniform", "distance"]
    }

    grid = GridSearchCV(knn, param_grid=param_grid, n_jobs=-1, cv=ps)
    grid.fit(X_trainval, y_trainval)
    print(grid.best_estimator_)
    print(grid.best_score_)
    print(grid.best_params_)

    # GRIDSEARCHCV - SVM
    ps = PredefinedSplit(prefold)
    svm = SVM()

    param_grid = {
        "C": [1.0, 2.0, 3.0, 4.0],
        "kernel": ["rbf", "poly"],
        "gamma": ["scale"],
    }

    grid = GridSearchCV(svm, param_grid=param_grid, n_jobs=-1, cv=ps)
    grid.fit(X_trainval, y_trainval)
    print(grid.best_estimator_)
    print(grid.best_score_)
    print(grid.best_params_)

    # GRIDSEARCHCV - RANDOM FOREST
    ps = PredefinedSplit(prefold)
    randforest = RandomForestClassifier()

    param_grid = {
        "n_estimators": [500, 600],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "max_features": ["auto"]
    }

    grid = GridSearchCV(randforest, param_grid=param_grid, n_jobs=-1, cv=ps)
    grid.fit(X_trainval, y_trainval)
    print(grid.best_estimator_)
    print(grid.best_score_)
    print(grid.best_params_)
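
A side note on the PredefinedSplit usage above: entries of -1 in the fold array are never placed in the test fold, so each grid-search candidate is scored only on the validation block. A small self-contained sketch (with a made-up fold array) illustrating that behaviour:

from sklearn.model_selection import PredefinedSplit

# 4 training rows (-1: never used as test) followed by 2 validation rows (fold 0)
test_fold = [-1, -1, -1, -1, 0, 0]
ps = PredefinedSplit(test_fold)

for train_idx, test_idx in ps.split():
    print("train:", train_idx, "test:", test_idx)
# -> train: [0 1 2 3] test: [4 5]  (a single split matching the manual train/val layout)
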
Code example #4
def yeo_johnson_target_transformer(self):
    yeo_johnson_target_transformer = PowerTransformer(method="yeo-johnson",
                                                      copy=True)
    yeo_johnson_target_transformer.fit(
        np.array(self.train_data[self.target]).reshape(-1, 1))
    return yeo_johnson_target_transformer
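
A sketch of how the returned transformer could be applied to a target column, shown here as a standalone example with made-up data rather than the class attributes used above:

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

# Made-up, right-skewed target values
y_train = pd.Series(np.random.lognormal(mean=0.0, sigma=1.0, size=200))

target_tf = PowerTransformer(method="yeo-johnson", copy=True)
target_tf.fit(np.array(y_train).reshape(-1, 1))

y_train_tf = target_tf.transform(np.array(y_train).reshape(-1, 1))

# Predictions made on the transformed scale can be mapped back with
# target_tf.inverse_transform(...)
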
Code example #5
    def prepare_data(self, df, look_back, freq_period, first=0, seq2seq=False):
        '''
        Decompose the signal into three sub-signals (trend, seasonal and
        residual) so that each component can be modelled separately.

        Parameters
        ----------
        df : DataFrame
            dataframe containing the historical data.
        look_back : int
            length of the model input window.

        Returns
        -------
        trend_x : array
            values of the trend of the signal, a matrix of dimension X arrays
            of dimension (1, look_back), with X = len(dataframe) / look_back.
        trend_y : array
            values to be predicted during training.
        seasonal_x : array
            same as trend_x but for the seasonal part of the signal.
        seasonal_y : array
            same as trend_y but for the seasonal part of the signal.
        residual_x : array
            same as trend_x but for the residual part of the signal.
        residual_y : array
            same as trend_y but for the residual part of the signal.
        '''
        self.seq2seq = seq2seq
        imputer = KNNImputer(n_neighbors=2, weights="uniform")
        df.loc[:, "y"] = imputer.fit_transform(np.array(df["y"]).reshape(-1, 1))
        if look_back % 2 == 0:
            window = freq_period + 1
        else:
            window = freq_period

        scalerfile = self.directory + '/scaler_pred.sav'
        if not os.path.isfile(scalerfile) or (os.path.isfile(scalerfile) and first == 1):
            if (df["y"].max() - df["y"].min()) > 100:
                if self.verbose == 1:
                    print("PowerTransformation scaler used")
                scaler = PowerTransformer()
            else:
                if self.verbose == 1:
                    print("Identity scaler used")
                scaler = IdentityTransformer()
            self.scaler2 = scaler.fit(np.reshape(np.array(df["y"]), (-1, 1)))
            Y = self.scaler2.transform(np.reshape(np.array(df["y"]), (-1, 1)))
            pickle.dump(self.scaler2, open(scalerfile, 'wb'))
        elif os.path.isfile(scalerfile) and first == 0:
            self.scaler2 = pickle.load(open(scalerfile, "rb"))
            Y = self.scaler2.transform(np.reshape(np.array(df["y"]), (-1, 1)))
        if freq_period % 2 == 0:
            freq_period = freq_period + 1
        decomposition = STL(Y, period=freq_period)
        decomposition = decomposition.fit()
        df.loc[:, 'trend'] = decomposition.trend
        df.loc[:, 'seasonal'] = decomposition.seasonal
        df.loc[:, 'residual'] = decomposition.resid
        self.trend = np.asarray(df.loc[:, 'trend'])
        self.seasonal = np.asarray(df.loc[:, 'seasonal'])
        self.residual = np.asarray(df.loc[:, 'residual'])
        if not self.seq2seq:
            trend_x, trend_y = decoupe_dataframe(df["trend"], look_back)
            seasonal_x, seasonal_y = decoupe_dataframe(df["seasonal"], look_back)
            residual_x, residual_y = decoupe_dataframe(df["residual"], look_back)
        else:
            trend_x, trend_y = sequence_dataframe(df["trend"], look_back, self.len_pred)
            seasonal_x, seasonal_y = sequence_dataframe(df["seasonal"], look_back, self.len_pred)
            residual_x, residual_y = sequence_dataframe(df["residual"], look_back, self.len_pred)
        if self.verbose == 1:
            print("prepared")
        return trend_x, trend_y, seasonal_x, seasonal_y, residual_x, residual_y
Code example #6
# In[46]:


y=data[data["max_heart_rate achieved"]<85]
y


# DATA PREPARATION

# In[47]:


from sklearn.preprocessing import PowerTransformer
log = PowerTransformer()
log.fit(data[['st_deprssion']])
data['log_depression'] = log.transform(data[['st_deprssion']])
data.drop('st_deprssion', inplace=True, axis=1)


# In[48]:


cnts_feature=['age','resting_blood_pressure','cholestoral','max_heart_rate achieved','log_depression']
cat_feature=[i for i in data.columns if i not in cnts_feature + ['target']]


# In[49]:


data=pd.get_dummies(data,columns=cat_feature)
Code example #7
#3 RobustScaler
# the centering and scaling statistics of this scaler are based on percentiles
# and are therefore not influenced by a small number of very large marginal outliers.
scaler3 = RobustScaler()
scaler3.fit(X)
X3 = scaler3.transform(X)
df3 = pd.DataFrame(data=X3, columns=column_names)
print(df3.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df3, xlim=[-2, 3],
              ylim=[-2, 3])  #Range -2 to 3

#4 PowerTransformer
# applies a power transformation to each feature to make the data more Gaussian-like
scaler4 = PowerTransformer()
scaler4.fit(X)
X4 = scaler4.transform(X)
df4 = pd.DataFrame(data=X4, columns=column_names)
print(df4.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df4)  #

#5 QuantileTransformer
# has an additional output_distribution parameter that allows matching a
# Gaussian distribution instead of a uniform distribution.
scaler5 = QuantileTransformer()
scaler5.fit(X)
X5 = scaler5.transform(X)
df5 = pd.DataFrame(data=X5, columns=column_names)
print(df5.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df5)  #
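
The block above uses QuantileTransformer with its default uniform output; a sketch of the same comparison with output_distribution='normal', reusing the X, column_names, pd and sns objects already assumed by this snippet:

#5b QuantileTransformer with Gaussian output
scaler5b = QuantileTransformer(output_distribution='normal')
scaler5b.fit(X)
X5b = scaler5b.transform(X)
df5b = pd.DataFrame(data=X5b, columns=column_names)
print(df5b.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df5b)
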
Code example #8
Comment:
EDA after creating baseline ver2.1
"""

# Import libraries
from sklearn.preprocessing import PowerTransformer

# Histogram of duration
plt.hist(train['duration'], bins=50)
plt.title('duration')
plt.show()

# Yeo-Johnson transform
pt = PowerTransformer(method='yeo-johnson')
data = train['duration'].values.reshape(-1, 1)
pt.fit(data)
train['duration'] = pt.transform(data)

# Histogram of duration after the Yeo-Johnson transform
plt.hist(train['duration'], bins=50)
plt.title('duration(Yeo-Johnson)')
plt.show()

# Histogram of campaign
plt.hist(train['campaign'], bins=50)
plt.title('campaign')
plt.show()

# Box-Cox transform
pt = PowerTransformer(method='box-cox')
data = train['campaign'].values.reshape(-1, 1)
Code example #9
print(y[:10])
print('************************************')

print(np.max(x), np.min(x))  # 711.0 / 0.0
print(dataset.feature_names)
# print(dataset.DESCR)

# Data preprocessing (MinMaxScaler: (x - min) / (max - min) -> 0 <= x' <= 1)

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
# scaler = StandardScaler()
# scaler = MaxAbsScaler()
scaler = PowerTransformer(method='yeo-johnson')
# scaler = PowerTransformer(method='box-cox') # can only be applied to strictly positive data
scaler.fit(x)
x = scaler.transform(x)

# Minmax
# print(np.max(x), np.min(x))   # 711.0 / 0.0  ->  1.0 / 0.0
# print(np.max(x[0]))           # 0.99999999999999999

#
print(np.max(x), np.min(x))  # 9.933930601860268 -3.9071933049810337
print(np.max(x[0]))  # 0.44105193260704206

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
Code example #10
class DualNNRec(RecommenderBase, ABC):

    # TODO add support for Early Stopping
    def __init__(
        self,
        weight_decay: float = 0.0,
        lr: float = 2e-5,
        cap_length: int = 128,
        eps: float = 1e-8,
        num_warmup_steps: int = 0,
        epochs: int = 4,
        ffnn_params: dict = None,
    ):
        super().__init__()
        self.device = None
        self.device = self._find_device()
        self.weight_decay = weight_decay
        self.lr = lr
        self.eps = eps
        self.num_warmup_steps = num_warmup_steps
        self.epochs = epochs

        self.cap_length = cap_length

        self.scaler = PowerTransformer(method='yeo-johnson')

        self.ffnn_params = ffnn_params

        self.model = None

        # Set the seed value all over the place to make this reproducible.
        self.seed_val = 42
        random.seed(self.seed_val)
        np.random.seed(self.seed_val)
        torch.manual_seed(self.seed_val)

    @abstractmethod
    def _get_model(self, ffnn_input_size):
        pass

    def _find_device(self):
        # If there's a GPU available...
        if torch.cuda.is_available():

            # Tell PyTorch to use the GPU.
            device = torch.device("cuda")

            print('There are %d GPU(s) available.' % torch.cuda.device_count())

            print('We will use the GPU:', torch.cuda.get_device_name(0))

        # If not...
        else:
            print('No GPU available, using the CPU instead.')
            device = torch.device("cpu")

        return device

    def _normalize_features(self, df, is_train=False):
        if is_train:
            print("Fitting yeo-johnson scaler")
            self.scaler.fit(df)
            # print(self.scaler.scale_, self.scaler.mean_, self.scaler.var_, self.scaler.n_samples_seen_)
        return pd.DataFrame(self.scaler.transform(df), columns=df.columns)

    def load_model(self):
        pass

    # TODO add support for cat features
    def fit(self,
            df_train_features: pd.DataFrame,
            df_train_tokens_reader: pd.io.parsers.TextFileReader,
            df_train_label: pd.DataFrame,
            df_val_features: pd.DataFrame,
            df_val_tokens_reader: pd.io.parsers.TextFileReader,
            df_val_label: pd.DataFrame,
            save_filename: str,
            cat_feature_set: set,
            normalize: bool = True,
            train_batches_to_skip: int = 0,
            val_batches_to_skip: int = 0,
            pretrained_model_dict_path: str = None,
            pretrained_optimizer_dict_path: str = None):

        self.df_train_label = df_train_label
        self.df_val_label = df_val_label

        print(df_train_features)
        print(df_val_features)

        assert len(
            df_train_label.columns) == 2, "it needs 2 labels in train df."

        assert len(df_val_label.columns) == 2, "it needs 2 labels in val df."

        assert len(df_train_features.columns) == len(df_val_features.columns), \
            "df_train_features and df_val_features have different number of columns"

        if normalize:
            df_train_features = self._normalize_features(df_train_features,
                                                         is_train=True)
            df_val_features = self._normalize_features(df_val_features)
            print(df_train_features)
            print(df_val_features)

        gpu = torch.cuda.is_available()
        if gpu:
            torch.cuda.manual_seed_all(self.seed_val)

        ffnn_input_size = HIDDEN_SIZE_BERT + df_train_features.shape[1]

        self.model = self._get_model(ffnn_input_size=ffnn_input_size)

        if pretrained_model_dict_path is not None:
            print(f"Loading pretrained model : {pretrained_model_dict_path}")
            self.model.load_state_dict(torch.load(pretrained_model_dict_path))

        if gpu:
            self.model.cuda()

        # freeze all bert layers
        # for param in self.model.bert.parameters():
        #     param.requires_grad = False
        train_dataset = CustomDatasetDualCap(
            df_features=df_train_features,
            df_tokens_reader=df_train_tokens_reader,
            df_label=df_train_label,
            cap=self.cap_length,
            batches_to_skip=train_batches_to_skip)
        val_dataset = CustomDatasetDualCap(
            df_features=df_val_features,
            df_tokens_reader=df_val_tokens_reader,
            df_label=df_val_label,
            cap=self.cap_length,
            batches_to_skip=val_batches_to_skip)

        train_dataloader, validation_dataloader = create_data_loaders(
            train_dataset,
            val_dataset,
            batch_size=df_train_tokens_reader.chunksize)

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params': [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
        # I believe the 'W' stands for 'Weight Decay fix'
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.lr,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
            eps=self.eps  # args.adam_epsilon  - default is 1e-8.
        )

        if pretrained_optimizer_dict_path is not None:
            print(
                f"Loading pretrained optimizer : {pretrained_optimizer_dict_path}"
            )
            optimizer.load_state_dict(
                torch.load(pretrained_optimizer_dict_path))

        # Total number of training steps is [number of batches] x [number of epochs].
        # (Note that this is not the same as the number of training samples).
        total_steps = len(train_dataloader) * self.epochs

        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,  # Default value in run_glue.py
            num_training_steps=total_steps)

        # We'll store a number of quantities such as training and validation loss,
        # validation accuracy, and timings.
        training_stats = []

        # Measure the total training time for the whole run.
        total_t0 = time.time()

        # For each epoch...
        for epoch_i in range(0, self.epochs):
            # ========================================
            #               Training
            # ========================================

            # Perform one full pass over the training set.

            print("")
            print('======== Epoch {:} / {:} ========'.format(
                epoch_i + 1, self.epochs))
            print('Training...')
            avg_train_loss, training_time, = self.train(
                self.model, train_dataloader, optimizer, scheduler)

            # ========================================
            #               Validation
            # ========================================
            # After the completion of each training epoch, measure our performance on
            # our validation set.

            print("")
            print("Running Validation...")

            avg_val_loss, validation_time = self.validation(
                model=self.model, validation_dataloader=validation_dataloader)

            # Record all statistics from this epoch.
            curr_stats = {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                # 'PRAUC train': prauc_train,
                # 'RCE train': rce_train,
                # 'PRAUC val': prauc_val,
                # 'RCE val': rce_val,
                'Valid. Loss': avg_val_loss,
                # 'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
            training_stats.append(curr_stats)

            pathlib.Path('./saved_models').mkdir(parents=True, exist_ok=True)

            model_path = f"./saved_models/saved_model_{save_filename}"
            optimizer_path = f"./saved_models/saved_optimizer_{save_filename}"

            print(f"Saving model : {model_path}")

            torch.save(self.model.state_dict(), model_path)
            torch.save(optimizer.state_dict(), optimizer_path)

            bot_string = f"DistilBertDoubleInput NN - dual_label \n ---------------- \n"
            bot_string = bot_string + str(self.model)
            bot_string = bot_string + "Weight decay: " + str(
                self.weight_decay) + "\n"
            bot_string = bot_string + "Learning rate: " + str(self.lr) + "\n"
            bot_string = bot_string + "Epsilon: " + str(
                self.eps) + "\n ---------------- \n"
            bot_string = bot_string + "\n".join(
                [key + ": " + str(curr_stats[key])
                 for key in curr_stats]) + "\n\n"
            bot_string = bot_string + "Saved to : " + model_path
            #telegram_bot_send_update(bot_string)

        print("")
        print("Training complete!")

        print("Total training took {:} (h:mm:ss)".format(
            format_time(time.time() - total_t0)))

        return training_stats

    def train(self, model, train_dataloader, optimizer, scheduler):

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0
        # total_train_prauc = 0
        # total_train_rce = 0

        # Put the model into training mode. Don't be misled--the call to
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()
        preds_list = [None] * 2
        labels_list = [None] * 2

        # For each batch of training data...
        for step, batch in tqdm(enumerate(train_dataloader),
                                total=len(train_dataloader)):

            # Progress update every 40 batches.
            #if step % 40 == 0 and not step == 0:
            #    # Calculate elapsed time in minutes.
            #    elapsed = format_time(time.time() - t0)
            #
            #    # Report progress.
            #    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: features
            #   [3]: labels
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_features = batch[2].to(self.device)
            b_labels = batch[3].to(self.device)

            #print("b_labels")
            #print(b_labels)
            #print(b_labels.shape)

            # print("b_labels:",b_labels.shape)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different numbers of parameters depending on what arguments
            # are given and what flags are set. For our usage here, it returns
            # the loss (because we provided labels) and the "logits"--the model
            # outputs prior to activation.
            output_list = model(
                input_ids=b_input_ids,
                input_features=b_features,
                # token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels)

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            loss = output_list[0][0]
            total_train_loss += loss.item()

            for i in range(2):
                curr_preds = output_list[i][2]

                if preds_list[i] is None:
                    preds_list[i] = curr_preds
                else:
                    preds_list[i] = np.hstack([preds_list[i], curr_preds])

                curr_labels = b_labels.detach().cpu().numpy()[:, i]

                if labels_list[i] is None:
                    labels_list[i] = curr_labels
                else:
                    labels_list[i] = np.hstack([labels_list[i], curr_labels])

            # print(f"batch {step} RCE: {rce}")
            # print(f"batch {step} PRAUC: {prauc}")

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        print(f"TRAINING STATISTICS FOR EPOCH")
        for i in range(2):
            prauc, rce, conf, max_pred, min_pred, avg = self.evaluate(
                preds=preds_list[i], labels=labels_list[i])
            if i == 0:
                print("\n------- LABEL 1 -------")
            elif i == 1:
                print("\n------- LABEL 2 -------")

            print(f"PRAUC : {prauc}"
                  f"\nRCE : {rce}"
                  f"\nMIN : {min_pred}"
                  f"\nMAX : {max_pred}"
                  f"\nAVG : {avg}")

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        return avg_train_loss, training_time

    def validation(self, model, validation_dataloader):

        t0 = time.time()
        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables
        total_eval_loss = 0
        preds_list = [None] * 2
        labels_list = [None] * 2

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Evaluate data for one epoch
        for step, batch in tqdm(enumerate(validation_dataloader),
                                total=len(validation_dataloader)):

            # Progress update every 40 batches.
            #if step % 40 == 0 and not step == 0:
            #    # Calculate elapsed time in minutes.
            #    elapsed = format_time(time.time() - t0)
            #
            #    # Report progress.
            #    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(validation_dataloader), elapsed))

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using
            # the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: features
            #   [3]: labels
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_features = batch[2].to(self.device)
            b_labels = batch[3].to(self.device)
            # print("b_labels:",b_labels.shape)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():
                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                output_list = model(
                    input_ids=b_input_ids,
                    input_features=b_features,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                )
            loss = output_list[0][0]
            total_eval_loss += loss.item()

            for i in range(2):
                curr_preds = output_list[i][2]

                if preds_list[i] is None:
                    preds_list[i] = curr_preds
                else:
                    preds_list[i] = np.hstack([preds_list[i], curr_preds])

                curr_labels = b_labels.detach().cpu().numpy()[:, i]

                if labels_list[i] is None:
                    labels_list[i] = curr_labels
                else:
                    labels_list[i] = np.hstack([labels_list[i], curr_labels])

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        print(f"VALIDATION STATISTICS FOR EPOCH")
        for i in range(2):
            prauc, rce, conf, max_pred, min_pred, avg = self.evaluate(
                preds=preds_list[i], labels=labels_list[i])
            if i == 0:
                print("\n------- LABEL 1 -------")
            elif i == 1:
                print("\n------- LABEL 2 -------")

            print(f"PRAUC : {prauc}"
                  f"\nRCE : {rce}"
                  f"\nMIN : {min_pred}"
                  f"\nMAX : {max_pred}"
                  f"\nAVG : {avg}")

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        return avg_val_loss, validation_time

    def evaluate(self, preds, labels=None):

        # print(preds)
        # print(preds.shape)
        # print(labels)
        # print(labels.shape)

        # Tries to load X and Y if not directly passed
        if (labels is None):
            print("No labels passed, cannot perform evaluation.")

        if (self.model is None):
            print("No model trained, cannot perform evaluation.")

        else:
            #print("preds")
            #print(preds)
            #print(preds.shape)
            #print("labels")
            #print(labels)
            #print(labels.shape)

            # Declaring the class containing the metrics
            cm = CoMe(preds, labels)

            # Evaluating
            prauc = cm.compute_prauc()
            rce = cm.compute_rce()
            # Confusion matrix
            conf = cm.confMatrix()
            # Prediction stats
            max_pred, min_pred, avg = cm.computeStatistics()

            return prauc, rce, conf, max_pred, min_pred, avg

    def get_prediction(self,
                       df_test_features: pd.DataFrame,
                       df_test_tokens_reader: pd.io.parsers.TextFileReader,
                       pretrained_model_dict_path: str = None,
                       normalize: bool = True):

        if normalize:
            df_test_features = self._normalize_features(df_test_features)

        if pretrained_model_dict_path is None:
            assert self.model is not None, "You are trying to predict without training."
        else:
            ffnn_input_size = HIDDEN_SIZE_BERT + df_test_features.shape[1]
            self.model = self._get_model(ffnn_input_size=ffnn_input_size)
            self.model.load_state_dict(torch.load(pretrained_model_dict_path))

        self.model.cuda()
        self.model.eval()

        preds = None

        test_dataset = CustomTestDatasetCap(
            df_features=df_test_features,
            df_tokens_reader=df_test_tokens_reader,
            cap=self.cap_length)
        test_dataloader = DataLoader(
            test_dataset,  # The test samples.
            sampler=SequentialSampler(
                test_dataset),  # Select batches sequentially
            batch_size=df_test_tokens_reader.chunksize
            # Generates predictions with this batch size.
        )

        # Evaluate data for one epoch
        for step, batch in tqdm(enumerate(test_dataloader),
                                total=len(test_dataloader)):
            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using
            # the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: features
            #   [3]: labels
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_features = batch[2].to(self.device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():
                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                curr_logits = self.model(
                    input_ids=b_input_ids,
                    input_features=b_features,
                    # token_type_ids=None, --> missing in distilbert
                    attention_mask=b_input_mask)

            curr_logits = curr_logits[0]

            #print(curr_logits)
            #print(curr_logits.shape)

            curr_preds = torch.sigmoid(curr_logits)

            curr_preds = curr_preds.detach().cpu().numpy()

            if preds is None:
                preds = curr_preds
            else:
                preds = np.vstack([preds, curr_preds])

        return preds
Code example #11
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=66)
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  random_state=66)

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, RobustScaler, PowerTransformer
# scaler = QuantileTransformer(n_quantiles=100)
scaler = PowerTransformer()
# scaler = RobustScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_val = scaler.transform(x_val)

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, BatchNormalization

model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(11, )))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
Code example #12
                                   marginal_x="rug",
                                   marginal_y="histogram")
# >> fig_density.show()

# Show density heatmap for cities
fig_city = pltx.density_heatmap(data.head(30000),
                                x="ORIGIN",
                                y="DEST",
                                marginal_y="histogram")
# >> fig_city.show()

# Explore the skewness
skew = data.skew()
print('Skewness:', skew)

# Fix using a yj transformation
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = data.select_dtypes(include=numerics)

pt = PowerTransformer(method='yeo-johnson')
skewed_features = []
for feature, skew_value in skew.items():
    if skew_value >= 1.5 and feature in num_cols.columns.values and feature != 'YEAR':
        skewed_features.append(feature)

pt.fit(data[skewed_features])
data[skewed_features] = pt.transform(data[skewed_features])

print('Skewness after normalization:', data.skew())
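
Because pt keeps the fitted lambdas, the same transformation can be undone or reapplied later; a short sketch (any held-out frame mentioned here is hypothetical):

# Recover the original scale of the transformed columns
recovered = pt.inverse_transform(data[skewed_features])

# A hypothetical held-out frame with the same columns could reuse the fitted
# transformer without refitting:
# new_data[skewed_features] = pt.transform(new_data[skewed_features])
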
Code example #13
def chang_hug_map(X, hex_colors, FONT_SIZE=12, BINS=30):
    '''
    Function that applies the Chang & Hug approach of preprocessing data to map it to a normal distribution:
    REF: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_map_data_to_normal.html#sphx-glr-auto-examples-preprocessing-plot-map-data-to-normal-py
    
    Parameters:
    * X = features
    * hex_colors = hexadecimal colors to be used for each feature
    * FONT_SIZE = size of font on plots
    * BINS = number of bins on histogram plots
    '''
    # setting preprocessing methods: PowerTransformer (Box-Cox, Yeo-Johnson); QuantileTransformer
    scaler = MinMaxScaler(feature_range=(1, 2))
    boxcox = PowerTransformer(method='box-cox')
    bc = Pipeline(steps=[('s', scaler), ('bc', boxcox)])

    yj = PowerTransformer(method='yeo-johnson')

    rng = np.random.RandomState(304)
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=rng)

    # adding distributions of columns
    distributions = []
    for i in range(0, len(X.columns)):
        name = X.columns[i]
        array = X[X.columns[i]].to_numpy().reshape(-1, 1)
        distributions.append((name, array))

    colors = hex_colors

    # generating the plot
    fig, axes = plt.subplots(
        nrows=12, ncols=15,
        figsize=(35, 25))  # cols = num of preprocessing methods + original
    axes = axes.flatten()
    axes_idxs = [
        (0, 15, 30, 45),
        (1, 16, 31, 46),
        (2, 17, 32, 47),
        (3, 18, 33, 48),
        (4, 19, 34, 49),
        (5, 20, 35, 50),  # first set
        (6, 21, 36, 51),
        (7, 22, 37, 52),
        (8, 23, 38, 53),
        (9, 24, 39, 54),
        (10, 25, 40, 55),
        (11, 26, 41, 56),
        (12, 27, 42, 57),
        (13, 28, 43, 58),
        (14, 29, 44, 59),
        (60, 75, 90, 105),
        (61, 76, 91, 106),
        (62, 77, 92, 107),
        (63, 78, 93, 108),
        (64, 79, 94, 109),
        (65, 80, 95, 110),  # second set
        (66, 81, 96, 111),
        (67, 82, 97, 112),
        (68, 83, 98, 113),
        (69, 84, 99, 114),
        (70, 85, 100, 115),
        (71, 86, 101, 116),
        (72, 87, 102, 117),
        (73, 88, 103, 118),
        (74, 89, 104, 119),
        (120, 135, 150, 165),
        (121, 136, 151, 166),
        (122, 137, 152, 167),
        (123, 138, 153, 168),
        (124, 139, 154, 169),
        (125, 140, 155, 170),
        (126, 141, 156, 171),
        (127, 142, 157, 172),
        (128, 143, 158, 173),
        (129, 144, 159, 174),
        (130, 145, 160, 175),
        (131, 146, 161, 176),
        (132, 147, 162, 177),
        (133, 148, 163, 178),
        (134, 149, 164, 179)
    ]

    axes_list = [(axes[i], axes[j], axes[k], axes[l])
                 for (i, j, k, l) in axes_idxs]

    for distribution, color, axes in zip(distributions, colors, axes_list):
        name, X_col = distribution
        X_train, X_test = train_test_split(X_col,
                                           test_size=0.2,
                                           random_state=rng)

        # perform power and quantile transforms
        X_trans_bc = bc.fit(X_train).transform(X_test)
        lmbda_bc = round(bc.named_steps['bc'].lambdas_[0], 2)
        X_trans_yj = yj.fit(X_train).transform(X_test)
        lmbda_yj = round(yj.lambdas_[0], 2)
        X_trans_qt = qt.fit(X_train).transform(X_test)

        ax_original, ax_bc, ax_yj, ax_qt = axes

        ax_original.hist(X_train, color=color, bins=BINS)
        ax_original.set_title(name, fontsize=FONT_SIZE)
        ax_original.tick_params(axis='both',
                                which='major',
                                labelsize=FONT_SIZE)

        for ax, X_trans, meth_name, lmbda in zip(
            (ax_bc, ax_yj, ax_qt), (X_trans_bc, X_trans_yj, X_trans_qt),
            ('Box-Cox', 'Yeo-Johnson', 'Quartile transform'),
            (lmbda_bc, lmbda_yj, None)):
            ax.hist(X_trans, color=color, bins=BINS)
            title = f'After {meth_name}'
            if lmbda is not None:
                title += f'\n$\\lambda$ = {lmbda}'
            ax.set_title(title, fontsize=FONT_SIZE)
            ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
            ax.set_xlim([-3.5, 3.5])

    # Setting last plot as empty
    for i in range(-10, 0):
        ax_original, ax_bc, ax_yj, ax_qt = axes_list[i]
        ax_original.axis('off')
        ax_bc.axis('off')
        ax_yj.axis('off')
        ax_qt.axis('off')

    # Export and last adjustments
    plt.tight_layout()
    plt.savefig('fig/09_col_trf.png')
    plt.show()
class PreprocessData:
    def __init__(self,
                 preprocess_type=None,
                 extend_data=False,
                 short_end=False):

        self.config = Config()
        # prepare input data
        config_path = self.config.get_filepath("", "config.yaml")

        with open(config_path, 'r') as config_file:
            yaml_config = yaml.load(config_file, Loader=yaml.SafeLoader)

        self.training_dataset_names = [
            d['name'] for d in yaml_config['training_datasets']
        ]
        self.training_dataset_start_pos = [
            d['start_position'] for d in yaml_config['training_datasets']
        ]
        self.test_dataset_names = [
            d['name'] for d in yaml_config['test_datasets']
        ]
        self.test_dataset_start_pos = [
            d['start_position'] for d in yaml_config['test_datasets']
        ]
        self.dataset_names = np.concatenate(
            (self.training_dataset_names,
             self.test_dataset_names))  # do we need these?
        self.dataset_start_pos = np.concatenate(
            (self.training_dataset_start_pos,
             self.test_dataset_start_pos))  # do we need these?

        # read in all pickle files
        self.all_pd = []
        for dataset_name in self.dataset_names:
            self.all_pd.append(
                pd.read_pickle(self.config.get_filepath_data(dataset_name)))

        if extend_data:
            training_dataset_names_copy = np.array(self.training_dataset_names,
                                                   copy=True)

            # create a copy of the data shifted up by 10
            for i, dataset_name in enumerate(training_dataset_names_copy):
                self.dataset_names = np.append(self.dataset_names,
                                               dataset_name + "_" + str(10))
                self.training_dataset_names = np.append(
                    self.training_dataset_names, dataset_name + "_" + str(10))
                self.dataset_start_pos = np.append(
                    self.dataset_start_pos, self.training_dataset_start_pos[i])
                self.training_dataset_start_pos.append(
                    self.training_dataset_start_pos[i])
                self.all_pd.append(self.all_pd[i].copy() + 10)

        self.dict_datasets = dict(
            zip(self.dataset_names, np.arange(len(self.dataset_names))))

        self.enable_difference = False

        self._feature_range = [0, 1]
        self.normalisation_scalers = []
        for _ in self.dataset_names:
            self.normalisation_scalers.append(
                MinMaxScaler(feature_range=self.feature_range))

        self.enable_normalisation_scaler = False
        self.enable_ignore_price = False  # scale each curve to feature_range

        self.power_transformer = PowerTransformer()
        self.enable_power_transform = False

        self.standardisation_scalers = []
        for _ in self.dataset_names:
            self.standardisation_scalers.append(StandardScaler())

        self.enable_standardisation_scaler = False

        self.enable_log_returns = False
        self.mult_factor = 10  # 5
        self.add_factor = 25  # 6

        self.enable_log = False
        self.enable_pct_change = False

        self.enable_curve_smoothing = False

        self.short_end = short_end

        # now setup PreprocessType settings
        if preprocess_type is PreprocessType.NORMALISATION_OVER_TENORS:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
        elif preprocess_type is PreprocessType.NORMALISATION_OVER_CURVES:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
            self.enable_ignore_price = True
        elif preprocess_type is PreprocessType.STANDARDISATION_OVER_TENORS:
            self.enable_standardisation_scaler = True
        elif preprocess_type is PreprocessType.LOG_RETURNS_OVER_TENORS:
            self.enable_log_returns = True

    @property
    def feature_range(self):  # implements the get - this name is *the* name
        return self._feature_range

    @feature_range.setter
    def feature_range(self, value):  # name must be the same
        self._feature_range = value
        for i, _ in enumerate(self.dataset_names):
            self.normalisation_scalers[i] = MinMaxScaler(feature_range=value)

    def get_data(self,
                 training_dataset_names=None,
                 test_dataset_names=None,
                 chunks_of=None):

        if training_dataset_names is None:
            training_dataset_names = self.training_dataset_names
        if isinstance(training_dataset_names, str):
            training_dataset_names = np.array([training_dataset_names])

        if test_dataset_names is None:
            test_dataset_names = self.test_dataset_names
        if test_dataset_names is None and self.test_dataset_names is None:
            test_dataset_names = []

        if isinstance(test_dataset_names, str):
            test_dataset_names = np.array([test_dataset_names])

        training_data = []
        test_data = []
        training_data_scaled = []
        test_data_scaled = []
        for key, value in self.dict_datasets.items():
            start_position = self.dataset_start_pos[value]
            end_position = None
            if chunks_of is not None:
                end_position = chunks_of * (
                    (self.all_pd[value].shape[0] - start_position) //
                    chunks_of)

            if key in training_dataset_names:
                # we take the log returns of each data set and scale wrt first dataset
                new_training_data = self.all_pd[value].copy(
                )[start_position:end_position]
                if self.short_end:
                    new_training_data = new_training_data.iloc[:, 0]

                new_training_data_scaled = self.scale_data(
                    new_training_data, value, True)

                training_data.append(new_training_data)
                training_data_scaled.append(new_training_data_scaled)

            if key in test_dataset_names:
                new_test_data = self.all_pd[value].copy(
                )[start_position:end_position]
                if self.short_end:
                    new_test_data = new_test_data.iloc[:, 0]

                new_test_data_scaled = self.scale_data(
                    new_test_data, value,
                    True)  # todo: should we scale test data wrt training data?

                test_data.append(new_test_data)
                test_data_scaled.append(new_test_data_scaled)

        maturities = self.all_pd[0].columns.values / (30 * 12)  # for years

        if test_dataset_names is not None:
            return training_data, test_data, training_data_scaled, test_data_scaled, training_dataset_names, test_dataset_names, maturities
        else:
            return training_data_scaled, maturities

    # def rescale_data_inputter(self, data, datasets=None):
    #     rescaled_data = []
    #     if datasets == "train":
    #         for i, name in enumerate(self.training_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #
    #     elif datasets == "test":
    #         for i, name in enumerate(self.test_dataset_names):
    #             # pos = self.dict_datasets[name]
    #             # self.scale_data(self, data, dataset_num=pos)
    #             rescaled_data.append(self.rescale_data(data[i], dataset_name=name))
    #
    #     return rescaled_data

    def scale_data(self, data, dataset_name=None, should_fit=False):

        # if given a numpy array, convert it to a dataframe first
        if type(data) is np.ndarray:
            _data = pd.DataFrame(data=data)
        elif isinstance(data, list):
            _data_list = []
            # if isinstance(dataset_name, list):
            for _data, _dataset_name in zip(data, dataset_name):
                _data_list.append(
                    self.scale_data(_data, _dataset_name, should_fit))
            # else:
            #     for _data in data:
            #         _data_list.append(self.scale_data(_data, should_fit, dataset_name))
            return _data_list
        else:
            _data = data.copy()

        time = _data.axes[0].tolist()
        # maturities = _data.columns.values

        dataset_num = 999
        if dataset_name is not None:
            if isinstance(dataset_name, numbers.Integral):
                dataset_num = dataset_name
            else:
                for key, value in self.dict_datasets.items():
                    if key == dataset_name:
                        dataset_num = value

        if self.enable_log:
            _data = _data.apply(np.log)

        if self.enable_difference:
            _data = _data.diff(axis=1)
            _data = _data.fillna(0)

        if self.enable_pct_change:
            _data = _data.pct_change()
            _data = _data.fillna(0)

        if self.enable_log_returns:
            shift = (_data.shift(0) + self.add_factor) / (
                _data.shift(1) + self.add_factor
            )  # add 6 to make it non-negative, to take the log later
            shift = shift.dropna()

            if not (np.array(shift) > 0).all():
                # some values are non-positive... this will break the log
                print("NON-POSITIVE VALUES FOUND, CANNOT PASS THROUGH LOG!!")
                print(np.min(_data))
                print(shift)

            _data = self.mult_factor * np.log(shift)

            time = _data.axes[0].tolist()

        # now use only numpy, convert pandas to numpy array
        _data = _data.values

        if self.short_end and len(_data.shape) == 1:
            _data = _data.reshape(-1, 1)

        if self.enable_standardisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.standardisation_scalers[dataset_num].fit(_data)
                _data = self.standardisation_scalers[dataset_num].transform(
                    _data)
            else:
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.standardisation_scalers[dataset_num].fit(
                        row_as_column)
                    temp = self.standardisation_scalers[dataset_num].transform(
                        row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_normalisation_scaler:
            if not self.enable_ignore_price:
                if should_fit:
                    self.normalisation_scalers[dataset_num].fit(_data)
                _data = self.normalisation_scalers[dataset_num].transform(
                    _data)
            else:
                data_temp = []
                for row in _data:
                    # row_as_2d = row.reshape(1, -1)
                    row_as_column = row[:, np.newaxis]
                    self.normalisation_scalers[dataset_num].fit(row_as_column)
                    temp = self.normalisation_scalers[dataset_num].transform(
                        row_as_column)
                    data_temp.append(temp.ravel())
                _data = np.array(data_temp)

        if self.enable_power_transform:
            if should_fit:
                self.power_transformer.fit(_data)
            _data = self.power_transformer.transform(_data)

        df = pd.DataFrame(data=_data, index=np.array(time))

        return df

    def rescale_data(self,
                     data,
                     dataset_name=None,
                     start_value=None,
                     index=None,
                     columns=None):

        if isinstance(data, pd.DataFrame):
            if columns is None:
                columns = data.columns.values
            if index is None:
                index = data.index.values

        if type(data) is np.ndarray:
            temp_data = data
        else:
            temp_data = np.array(data)

        if self.short_end and len(temp_data.shape) == 1:
            temp_data = temp_data.reshape(-1, 1)

        dataset_num = 999
        if dataset_name is not None:
            for key, value in self.dict_datasets.items():
                if key == dataset_name:
                    dataset_num = value

        if self.enable_difference:
            temp_data = temp_data  # TODO: inverse difference

        if self.enable_power_transform:
            temp_data = self.power_transformer.inverse_transform(temp_data)

        if self.enable_normalisation_scaler:

            # we need to scale each rolling window manually
            if self.enable_ignore_price:
                # rescale each curve individually
                data_min = self.all_pd[dataset_num].min(axis=1)
                data_max = self.all_pd[dataset_num].max(axis=1)
                a = self.feature_range[0]
                b = self.feature_range[1]
                for i in np.arange(temp_data.shape[0]):
                    temp_data[i] = (
                        (temp_data[i] - a) /
                        (b - a)) * (data_max[i] - data_min[i]) + data_min[i]
            else:
                if len(temp_data.shape) == 3:
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.normalisation_scalers[dataset_num].
                            inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)

                else:
                    temp_data = self.normalisation_scalers[
                        dataset_num].inverse_transform(temp_data)

        if self.enable_standardisation_scaler:
            # temp_data = self.standardisation_scaler.inverse_transform(temp_data)
            if self.enable_ignore_price:
                raise NotImplementedError
            else:
                if len(temp_data.shape) == 3:
                    new_temp_data = []
                    for i in np.arange(temp_data.shape[0]):
                        new_temp_data.append(
                            self.standardisation_scalers[dataset_num].
                            inverse_transform(temp_data[i]))
                    temp_data = np.array(new_temp_data)

                else:
                    temp_data = self.standardisation_scalers[
                        dataset_num].inverse_transform(temp_data)

        if self.enable_log:
            temp_data = np.exp(temp_data)

        if self.enable_log_returns:

            # if start_value is not assigned but dataset_name is, use the first value of the dataset as start_value
            if dataset_name is not None and start_value is None:
                _start_value = self.all_pd[dataset_num].iloc[0]
            elif start_value is not None:
                _start_value = start_value
            else:
                _start_value = 1.

            # print("shapes, log-return rescale", temp_data.shape, _start_value.shape, _start_value[0].shape)

            if len(temp_data.shape) == 1:
                z = np.exp(temp_data / self.mult_factor)

                z = np.insert(
                    np.array(z), 0, _start_value[0] +
                    self.add_factor)  # instead of the usual _start_value
                temp_data = np.cumprod(z) - self.add_factor
                temp_data = pd.DataFrame(data=temp_data,
                                         index=self.all_pd[dataset_num].index)
                # print(temp_data.head(10))
            elif len(temp_data.shape) == 2:  # log-returns on an individual batch, todo: check

                if self.short_end:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z,
                                  0,
                                  _start_value[0] + self.add_factor,
                                  axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor
                else:
                    z = np.exp(temp_data / self.mult_factor)
                    z = np.insert(z, 0, _start_value + self.add_factor, axis=0)
                    temp_data = np.cumprod(z, axis=0) - self.add_factor

            elif len(temp_data.shape) > 2:  # log-returns on multiple batches
                z = np.exp(temp_data[:, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor, axis=1)
                temp_data = np.cumprod(z, axis=1) - self.add_factor
            else:
                z = np.exp(temp_data[0, :] / self.mult_factor)
                z = np.insert(z, 0, _start_value + self.add_factor)
                temp_data = np.cumprod(z) - self.add_factor

            # print("log returns undo...", _start_value, temp_data[0])

        if self.enable_curve_smoothing:
            curve_smooth = []

            for curve in temp_data:
                curve_smooth.append(savgol_filter(
                    curve, 23, 5))  # window size 23, polynomial order 5
            temp_data = np.array(curve_smooth)

        if index is not None and columns is not None:
            return pd.DataFrame(temp_data, index=index, columns=columns)
        else:
            return temp_data
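# The enable_log_returns branch above converts each series x_t into scaled
# log-returns r_t = mult_factor * log((x_t + add_factor) / (x_{t-1} + add_factor)),
# and rescale_data inverts this with np.insert + np.cumprod. A minimal standalone
# sketch of that round trip, using hypothetical add_factor / mult_factor values:
import numpy as np
import pandas as pd

add_factor, mult_factor = 6.0, 100.0
prices = pd.Series([1.2, 1.5, 1.1, 1.8])

# forward: ratio of current to previous offset value; the first row is NaN and dropped
ratio = (prices + add_factor) / (prices.shift(1) + add_factor)
log_returns = mult_factor * np.log(ratio.dropna())

# inverse: undo the log, prepend the first offset price, cumulative product, un-offset
z = np.exp(log_returns.to_numpy() / mult_factor)
z = np.insert(z, 0, prices.iloc[0] + add_factor)
recovered = np.cumprod(z) - add_factor
assert np.allclose(recovered, prices.to_numpy())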
Code example #15
0


#Log transformation

# In the previous exercises you scaled the data linearly, which does not change the
# data's shape. That works well if your data is (approximately) normally distributed,
# an assumption many machine learning models make. Some variables do conform closely
# to normality, e.g. the height or weight of a population; many real-world variables,
# e.g. wages or the age of a population, do not. In this exercise you will use a log
# transform on the ConvertedSalary column of the so_numeric_df DataFrame, since most
# of its values sit at the low end while a few are very large. Such distributions are
# said to have a long right tail. (The code below uses PowerTransformer; a plain
# np.log sketch follows it.)


# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Instantiate PowerTransformer
pow_trans = PowerTransformer()

# Train the transform on the data
pow_trans.fit(so_numeric_df[['ConvertedSalary']])

# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(so_numeric_df[['ConvertedSalary']])

# Plot the data before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()
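# For comparison with the PowerTransformer above, a plain log transform (what the
# text literally describes) could look like the sketch below. ConvertedSalary_log is
# a hypothetical column name, and np.log1p is used so zero salaries do not give -inf;
# so_numeric_df is assumed to be the DataFrame from the surrounding example.
import numpy as np
import matplotlib.pyplot as plt

so_numeric_df['ConvertedSalary_log'] = np.log1p(so_numeric_df['ConvertedSalary'])
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_log']].hist()
plt.show()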



#Percentage based outlier removal

# One way to ensure that a small portion of the data does not have an overly adverse
# effect is to remove a certain percentage of the largest and/or smallest values in
# the column. This is done by finding the relevant quantile and trimming the data
# with a mask. It is particularly useful when you want to guard against the very
# highest values in your dataset. Keep in mind that even if there are no outliers,
# this still removes the same top N percent of the data (see the sketch below).

# Find the 95th quantile
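# A hedged sketch of the percentile-based trim described above: compute the 95th
# percentile and keep only rows below it. trimmed_df is a hypothetical name, and
# so_numeric_df is assumed to be the DataFrame from the surrounding example.
quantile = so_numeric_df['ConvertedSalary'].quantile(0.95)
trimmed_df = so_numeric_df[so_numeric_df['ConvertedSalary'] < quantile]

# Re-plotting confirms the long right tail has been removed
trimmed_df[['ConvertedSalary']].hist()
plt.show()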
Code example #16
0
    del df_iterator
    gc.collect()

    return ret


##############################Train########################################
train_path = "/data/recsys2020/history_nn/TrainXGB.csv"
train_dict = generate_dict_np(train_path)

##Fit scalers
scaler_f = PowerTransformer(copy=False)
start_time = time.time()
s = len(train_dict['features'])
scaler_f.fit(train_dict['features'][np.random.choice(s, int(0.1 * s))].astype(
    np.float64, copy=False))
print("Elapsed: {0}".format(inhour(time.time() - start_time)))
print("fit feature scaler")
##Save scalers
with open('/data/recsys2020/history_nn/f_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_f, f, protocol=4)
##Fit scalers

## Load scalers
# with open('/data/recsys2020/history_nn/f_scaler.pkl', 'rb') as f:
#    scaler_f = pickle.load(f)
## Load scalers

##Apply scalers to train set
start_time = time.time()
train_dict['features'] = scaler_f.transform(train_dict['features'])
Code example #17
0
    'darkorchid'
]

fig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2))
axes = axes.flatten()
axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21),
             (13, 16, 19, 22), (14, 17, 20, 23)]
axes_list = [(axes[i], axes[j], axes[k], axes[l])
             for (i, j, k, l) in axes_idxs]

for distribution, color, axes in zip(distributions, colors, axes_list):
    name, X = distribution
    X_train, X_test = train_test_split(X, test_size=.5)

    # perform power transforms and quantile transform
    X_trans_bc = bc.fit(X_train).transform(X_test)
    lmbda_bc = round(bc.lambdas_[0], 2)
    X_trans_yj = yj.fit(X_train).transform(X_test)
    lmbda_yj = round(yj.lambdas_[0], 2)
    X_trans_qt = qt.fit(X_train).transform(X_test)

    ax_original, ax_bc, ax_yj, ax_qt = axes

    ax_original.hist(X_train, color=color, bins=BINS)
    ax_original.set_title(name, fontsize=FONT_SIZE)
    ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

    for ax, X_trans, meth_name, lmbda in zip(
        (ax_bc, ax_yj, ax_qt), (X_trans_bc, X_trans_yj, X_trans_qt),
        ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
        (lmbda_bc, lmbda_yj, None)):
Code example #18
0
def get_data(
        ssX=None,
        batch_size=32,
        train=True,
        **kwargs):
    """
    inputs:
        batch_size: int

    return:
        (dataloader, test_dataloader)
    """
    plot_random = False if 'plot_random' not in kwargs else kwargs['plot_random']
    plot_resonant = not plot_random
    train_all = False if 'train_all' not in kwargs else kwargs['train_all']
    plot = False if 'plot' not in kwargs else kwargs['plot']
    if not train_all and ssX is None:
        plot_resonant = True
        plot_random = False

    if train_all:
        filename = 'data/combined.pkl'
    elif plot_resonant:
        filename = 'data/resonant_dataset.pkl'
    elif plot_random:
        filename = 'data/random_dataset.pkl'

    # These are generated by data_from_pkl.py
    loaded_data = pkl.load(
        open(filename, 'rb')
    )

    train_ssX = (ssX is None)

    fullX, fully = loaded_data['X'], loaded_data['y']

    if train_all:
        len_random = 17082 #Number of valid random examples (others have NaNs)
        random_data = np.arange(len(fullX)) >= (len(fullX) - len_random)


    # Differentiate megno
    if 'fix_megno' in kwargs and kwargs['fix_megno']:
        idx = [i for i, lab in enumerate(loaded_data['labels']) if 'megno' in lab][0]
        fullX[:, 1:, idx] -= fullX[:, :-1, idx]

    if 'include_derivatives' in kwargs and kwargs['include_derivatives']:
        derivative = fullX[:, 1:, :] - fullX[:, :-1, :]
        derivative = np.concatenate((
            derivative[:, [0], :],
            derivative), axis=1)
        fullX = np.concatenate((
            fullX, derivative),
            axis=2)


    # Hide fraction of test
    # MAKE SURE WE DO COPIES AFTER!!!!
    if train:
        if train_all:
            remy, finaly, remX, finalX, rem_random, final_random = train_test_split(fully, fullX, random_data, shuffle=True, test_size=1./10, random_state=0)
            trainy, testy, trainX, testX, train_random, test_random = train_test_split(remy, remX, rem_random, shuffle=True, test_size=1./10, random_state=1)
        else:
            remy, finaly, remX, finalX = train_test_split(fully, fullX, shuffle=True, test_size=1./10, random_state=0)
            trainy, testy, trainX, testX = train_test_split(remy, remX, shuffle=True, test_size=1./10, random_state=1)
    else:
        assert not train_all
        remy = fully
        finaly = fully
        testy = fully
        trainy = fully
        remX = fullX
        finalX = fullX
        testX = fullX
        trainX = fullX

    if plot:
        # Use test dataset for plotting, so put it in validation part:
        testX = finalX
        testy = finaly

    if train_ssX:
        if 'power_transform' in kwargs and kwargs['power_transform']:
            ssX = PowerTransformer(method='yeo-johnson') #Power is best
        else:
            ssX = StandardScaler()

    n_t = trainX.shape[1]
    n_features = trainX.shape[2]

    if train_ssX:
        ssX.fit(trainX.reshape(-1, n_features)[::1539])

    ttrainy = trainy
    ttesty = testy
    ttrainX = ssX.transform(trainX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    ttestX = ssX.transform(testX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    if train_all:
        ttest_random = test_random
        ttrain_random = train_random

    tremX = ssX.transform(remX.reshape(-1, n_features)).reshape(-1, n_t, n_features)
    tremy = remy

    train_len = ttrainX.shape[0]
    X = Variable(torch.from_numpy(np.concatenate((ttrainX, ttestX))).type(torch.FloatTensor))
    y = Variable(torch.from_numpy(np.concatenate((ttrainy, ttesty))).type(torch.FloatTensor))
    if train_all:
        r = Variable(torch.from_numpy(np.concatenate((ttrain_random, ttest_random))).type(torch.BoolTensor))

    Xrem = Variable(torch.from_numpy(tremX).type(torch.FloatTensor))
    yrem = Variable(torch.from_numpy(tremy).type(torch.FloatTensor))

    idxes = np.s_[:]
    dataset = torch.utils.data.TensorDataset(X[:train_len, :, idxes], y[:train_len])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=8)

    # Cut up dataset into only the random or resonant parts. 
    # Only needed if plotting OR 
    if (not plot) or (not train_all):
        test_dataset = torch.utils.data.TensorDataset(X[train_len:, :, idxes], y[train_len:])
    else:
        if plot_random:
            mask = r
        else:
            mask = ~r
        print(f'Plotting with {mask.sum()} total elements, when plot_random={plot_random}')
        # use the computed mask (rather than r directly) so the resonant case selects ~r
        test_dataset = torch.utils.data.TensorDataset(X[train_len:][mask[train_len:]][:, :, idxes], y[train_len:][mask[train_len:]])

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=3000, shuffle=False, pin_memory=True, num_workers=8)
        
    kwargs['model'].ssX = copy(ssX)

    return dataloader, test_dataloader
Code example #19
0
# As income increases, CCAvg also increases, and customers tend to take more personal loans.
sns.scatterplot(x='CCAvg',y='Income',hue = 'PersonalLoan',data = df)

# NO CORRELATION BETWEEN A CUSTOMER USING INTERNET BANKING FACILITIES AND TAKING A PERSONAL LOAN.
sns.countplot(x='Online',hue='PersonalLoan',data=df)

sns.boxplot(x='PersonalLoan',y='CCAvg',data=df)

"""# **NECESSARY TRANSFORMATIONS FOR FEATURE VARIABLES**"""

y=df['PersonalLoan']
x=df.drop(['PersonalLoan'],axis=1)

from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method = "yeo-johnson", standardize = False)
pt.fit(x['Income'].values.reshape(-1,1))
x['Income'] = pt.transform(x['Income'].values.reshape(-1,1))
sns.distplot(x.Income)

pt = PowerTransformer(method = "yeo-johnson", standardize = False)
pt.fit(x['CCAvg'].values.reshape(-1,1))
x['CCAvg'] = pt.transform(x['CCAvg'].values.reshape(-1,1))
sns.distplot(x.CCAvg)

x['Mortgage_Int'] = pd.cut(x['Mortgage'],
                           bins=[0, 100, 200, 300, 400, 500, 600, 700],
                           labels=[0, 1, 2, 3, 4, 5, 6],
                           include_lowest=True)
x.drop('Mortgage',axis = 1, inplace = True)
sns.distplot(x.Mortgage_Int)
Code example #20
0
sns.reset_defaults()
#sns.set_style('whitegrid')
#sns.set_context('talk')
sns.set_context(context='talk', font_scale=0.7)

tfd = tfp.distributions

nametrain = '/Users/aklimase/Documents/USGS/data/cybertrainyeti10_residfeb.csv'
nametest = '/Users/aklimase/Documents/USGS/data/cybertestyeti10_residfeb.csv'
train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
    nametrain, nametest, n=12)

#%%
# preprocessing: transform input data to be Gaussian-shaped
pt = PowerTransformer()
aa = pt.fit(train_data1[:, :])
train_data = aa.transform(train_data1)
test_data = aa.transform(test_data1)

train_targets = train_targets1[0:5000]
test_targets = test_targets1[0:5000]

y_test = test_targets1.T[0:1]
y_train = train_targets1.T[0:1]

x_range = [[min(train_data.T[i]) for i in range(len(train_data[0]))],
           [max(train_data.T[i]) for i in range(len(train_data[0]))]]

x_train = train_data[0:5000]
x_test = test_data[0:5000]
Code example #21
0
# Transform the data using the fitted scaler
so_numeric_df['Age_SS'] = SS_scaler.transform(so_numeric_df[['Age']])

# Compare the original and transformed columns
print(so_numeric_df[['Age_SS', 'Age']].head())

## Log transformation

# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Instantiate PowerTransformer
pow_trans = PowerTransformer()

# Train the transform on the data
pow_trans.fit(so_numeric_df[["ConvertedSalary"]])

# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(
    so_numeric_df[['ConvertedSalary']])

# Plot the data before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()

### Removing outliers

## Percentage based outlier removal

# Find the 95th quantile
quantile = so_numeric_df['ConvertedSalary'].quantile(0.95)
Code example #22
0
def single_results(datFiles, splitFile):
    # LOADING TRAIN_TEST_VALIDATION SPLIT FILE
    split = pd.read_csv(splitFile)
    split = split.drop(["id", "synsetId", "subSynsetId"], axis=1)

    # SETTING SPLIT VARIABLES
    train = split.loc[split["split"] == "train"]
    test = split.loc[split["split"] == "test"]
    val = split.loc[split["split"] == "val"]

    for datFile in datFiles:
        # LOADING DATA FILE
        df = pd.read_csv(datFile, header=None)
        n_features = len(df.columns) - 2

        feats = ["z{}".format(x) for x in range(n_features)]
        cols = feats + ["sample", "class"]
        df.columns = cols

        # SPLITTING SETS
        train_set = df.loc[df["sample"].isin(train["modelId"])]
        test_set = df.loc[df["sample"].isin(test["modelId"])]
        val_set = df.loc[df["sample"].isin(val["modelId"])]

        X_train = train_set.drop(["sample", "class"], axis=1)
        y_train = train_set["class"]
        X_test = test_set.drop(["sample", "class"], axis=1)
        y_test = test_set["class"]
        X_val = val_set.drop(["sample", "class"], axis=1)
        y_val = val_set["class"]

        # REMOVE ZERO VARIANCE
        selector = VarianceThreshold()
        X_train = selector.fit_transform(X_train)
        # transform (not fit_transform) so test/val keep the features selected on train
        X_test = selector.transform(X_test)
        X_val = selector.transform(X_val)

        # STANDARDIZATION
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        X_val = scaler.transform(X_val)

        # SKEW REMOVAL
        pt = PowerTransformer(method="yeo-johnson", standardize=False)
        pt.fit(X_train)
        X_train = pt.transform(X_train)
        X_test = pt.transform(X_test)
        X_val = pt.transform(X_val)

        # CLASSIFIERS
        classifiers = {
            "kNN": KNN(n_neighbors=8, weights="distance"),
            "SVM": SVM(C=3, gamma="scale", kernel="rbf"),
            "RFC": RandomForest(n_estimators=500),
        }

        ans = {
            "classifier": [],
            "accuracy": [],
        }

        for name, classifier in classifiers.items():
            # CLASSIFICATION
            classifier.fit(X_train, y_train)
            accuracy = classifier.score(X_test, y_test)

            accuracy = round(100 * accuracy, 2)

            ans["classifier"].append(name)
            ans["accuracy"].append(accuracy)

            print("{}\t{}\t{}".format(datFile, name, accuracy))

        ans = pd.DataFrame(ans)
        ans.to_csv(datFile.replace(".dat", "_ans.csv"), index=None)

        del ans
        del classifiers
Code example #23
0
def train():

    seed = 0

    df = pd.read_csv('listings.csv')

    train, test = train_test_split(df,
                                   test_size=0.2,
                                   random_state=seed,
                                   shuffle=True)

    # Drop unnecessary columns
    train = train[[
        'neighbourhood_group', 'neighbourhood', 'room_type', 'minimum_nights',
        'price'
    ]]
    test = test[[
        'neighbourhood_group', 'neighbourhood', 'room_type', 'minimum_nights',
        'price'
    ]]

    # Power Transform
    X_train = train.drop(['price'], axis=1)
    y_train = train['price'].values

    X_test = test.drop(['price'], axis=1)
    y_test = test['price'].values

    num_cols = X_train._get_numeric_data().columns.tolist()

    pt = PowerTransformer(method='yeo-johnson')

    X_train[num_cols] = pt.fit_transform(X_train[num_cols])
    X_test[num_cols] = pt.transform(X_test[num_cols])

    # saving transformer first
    joblib.dump(pt.fit(y_train.reshape(-1, 1)), 'powerTransform.joblib')

    y_train = pt.fit_transform(y_train.reshape(-1, 1))
    y_test = pt.transform(y_test.reshape(-1, 1))

    # Label Encoder
    le = LabelEncoder()

    cat_cols_train = X_train.select_dtypes(
        include=['string', 'object']).columns.tolist()

    cat_cols_test = X_test.select_dtypes(
        include=['string', 'object']).columns.tolist()

    for col in cat_cols_train:

        joblib.dump(le.fit(X_train[col].astype('string')),
                    'le_{}.joblib'.format(col))

        X_train[col] = le.fit_transform(X_train[col].astype('string'))

    # Fit on the test set as well because it contains labels not seen in the training set
    for col in cat_cols_test:
        X_test[col] = le.fit_transform(X_test[col].astype('string'))

    # Outliers
    X_train['price'] = y_train.ravel().tolist()

    X_train.drop(X_train[(X_train['price'] < -4)].index, inplace=True)

    y_train = X_train['price']

    X_train.drop('price', axis=1, inplace=True)

    # Model
    X_train = X_train.values

    y_train = y_train.values

    model = LGBMRegressor(max_depth=10, num_leaves=20, random_state=0)

    model.fit(X_train, y_train)

    joblib.dump(model, "model.joblib")
Code example #24
0
def yeo_johnson_transformer(self):
    yeo_johnson_transformer = PowerTransformer(method="yeo-johnson", copy=True)
    yeo_johnson_transformer.fit(self.train_imputed_numeric_df)
    return yeo_johnson_transformer
Code example #25
0
data.dtypes

# astype dataset
data.REG_YYMM = data.REG_YYMM.astype('category')
data.CARD_SIDO_NM = data.CARD_SIDO_NM.astype('category')
data.CARD_CCG_NM = data.CARD_CCG_NM.astype('category')
data.STD_CLSS_NM = data.STD_CLSS_NM.astype('category')
data.HOM_SIDO_NM = data.HOM_SIDO_NM.astype('category')
data.HOM_CCG_NM = data.HOM_CCG_NM.astype('category')
data.AGE = data.AGE.astype('category')
data.SEX_CTGO_CD = data.SEX_CTGO_CD.astype('category')
data.FLC = data.FLC.astype('category')

# Transformation
pt = PowerTransformer(method='box-cox', standardize=False)
pt.fit(data.iloc[:, 9:12])
pt_int_data = pt.transform(data.iloc[:, 9:12])
pt.lambdas_

# Group by
category_data = pd.DataFrame(data.iloc[:, :9])
pt_int_data = pd.DataFrame(pt_int_data, columns=data.columns[9:12])
pt_data = pd.concat([category_data, pt_int_data], axis=1)
pt_data = pt_data.sort_values(by='REG_YYMM')

groupby_pt = pt_data.groupby(list(data.columns), observed=True)
sum_groupby_pt = groupby_pt.sum()

pt_data.REG_YYMM.value_counts()

# Shaping
Code example #26
0
File: NN_spherical.py  Project: jjp4595/NN_charges
# X = np.column_stack((lol, X[:,2]))

# # difference data
# y_gauss = JP_highZ(X[:,0], (X[:, 1] / (X[:,0]**(1/3)) ), X[:,2]) *1000
# y_gauss = y_gauss.reshape(len(y_gauss),1)
# y = y_gauss - y_og

#Scaling X
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
scaler_x = scaler.fit(X)

#scaling y
scaler2 = PowerTransformer()
#scaler2 = MinMaxScaler(feature_range=(0,1))
scaler_y = scaler2.fit(y)
y_scaled = scaler_y.transform(y)

# use a fresh MinMaxScaler here so scaler_x above keeps its fit on X
scaler_y2 = MinMaxScaler(feature_range=(0, 1)).fit(y_scaled)
y_scaled = scaler_y2.transform(y_scaled)


# create model
def baseline_model():
    model = Sequential()
    model.add(
        Dense(200,
              input_dim=2,
              kernel_initializer='he_uniform',
              activation='relu'))
    #model.add(Dropout(0.2))
Code example #27
0
File: transform.py  Project: wuzunzun/XenonPy
class PowerTransformer(BaseEstimator, TransformerMixin):
    """
    Box-Cox / Yeo-Johnson power transform.
    References
    ----------
    G.E.P. Box and D.R. Cox, “An Analysis of Transformations”,
    Journal of the Royal Statistical Society B, 26, 211-252 (1964).
    """
    def __init__(self,
                 *,
                 method='yeo-johnson',
                 standardize=False,
                 lmd=None,
                 tolerance=(-np.inf, np.inf),
                 on_err=None):
        """

        Parameters
        ----------
        method: 'yeo-johnson' or 'box-cox'
            ‘yeo-johnson’ works with positive and negative values
            ‘box-cox’ only works with strictly positive values
        standardize: boolean
            Normalize to standard normal or not.
            Recommend using a separate `standard` function instead of this option.
        lmd: list or 1-dim ndarray
            You can assign a specific lmd to each input column yourself.
            Leave None (default) to use an inferred value.
            See `PowerTransformer`_ for details.
        tolerance: tuple
            Tolerance of lmd. Set None to accept any.
            Default is **(-np.inf, np.inf)** but **(-2, 2)** is recommended for the Box-Cox transform.
        on_err: None or str
            Error handling when trying to infer lambda. Can be None or **log**, **nan** or **raise** as a string.
            **log** will return the logarithmic transform of xs shifted so the minimum is 1.
            **nan** returns an ``ndarray`` with shape xs.shape filled with ``np.nan``.
            **raise** raises a FloatingPointError that you can catch yourself.
            Default (None) will return the input series without a scale transform.
        .. _PowerTransformer:
            https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
        """
        self._tolerance = tolerance
        self._pt = PT(method=method, standardize=standardize)
        self._lmd = lmd
        self._shape = None
        self._on_err = on_err

    def _check_type(self, x):
        if isinstance(x, list):
            x = np.array(x, dtype=float)  # np.float was removed in newer NumPy
        elif isinstance(x, (DataFrame, Series)):
            x = x.values
        if not isinstance(x, np.ndarray):
            raise TypeError(
                'parameter `X` should be a `DataFrame`, `Series`, `ndarray` or list object '
                'but got {}'.format(type(x)))
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        return x

    def fit(self, x):
        """
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature transformation

        Returns
        -------
        self : object
            Fitted scaler.
        """

        x = self._pt._check_input(self._check_type(x), in_fit=True)

        # forcing constant column vectors to have no transformation (lambda=1)
        idx = []
        for i, col in enumerate(x.T):
            if np.all(col == col[0]):
                idx.append(i)

        if self._lmd is not None:
            if isinstance(self._lmd, float):
                self._pt.lambdas_ = np.array([self._lmd] * x.shape[1])
            elif x.shape[1] != len(self._lmd):
                raise ValueError(
                    'shape[1] of parameter `X` should be {} but got {}'.format(
                        x.shape[1], len(self._lmd)))
            else:
                self._pt.lambdas_ = np.array(self._lmd)
        else:
            self._pt.fit(x)

        if len(idx) > 0:
            self._pt.lambdas_[idx] = 1.

        return self

    def transform(self, x):
        ret = self._pt.transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret

    def inverse_transform(self, x):
        ret = self._pt.inverse_transform(self._check_type(x))
        if isinstance(x, pd.DataFrame):
            return pd.DataFrame(ret, index=x.index, columns=x.columns)
        return ret
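# A hypothetical usage sketch of the wrapper above; it assumes `PT` is an alias for
# sklearn.preprocessing.PowerTransformer and that pandas/numpy are available, as in
# the original module. Constant columns would be left untransformed by fit (lambda
# forced to 1); the made-up frame below simply shows the DataFrame round trip.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [0.5, 1.5, 2.5, 10.0]})

pt = PowerTransformer(method='yeo-johnson', standardize=False)
trans = pt.fit(df).transform(df)        # DataFrame in, DataFrame out
restored = pt.inverse_transform(trans)  # round-trips back to the original values
print(np.allclose(restored.to_numpy(), df.to_numpy()))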
Code example #28
0
def get_outliers(
        data, STD_NORM, side, METHOD='yeo-johnson',
        PLOT=False, title=None, title_fontsize=None,
        x_label=None, y_label=None, label_fontsize=None
):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.preprocessing import PowerTransformer
    from statsmodels.graphics.gofplots import qqplot
    import colourPals as cp
    import importlib
    importlib.reload(cp)
    # ==================================================
    # Error checking
    assert side == 'left' or side == 'right', "'side' argument has to be either 'left' or 'right'"
    # ==================================================
    # If 'box-cox' is selected and the minimum of the data is not strictly positive, compute a constant k to shift the data so that the transformation can be performed.
    if METHOD == 'box-cox' and min(data) <= 0:
        k = 1 - min(data)
        data = data + k

    # ----- Transform data
    pt = PowerTransformer(method=METHOD)
    # Find optimal lambda value for transform
    pt.fit(data.to_numpy().reshape(-1, 1))
    # Transform data to a normal distribution
    data_trans = pt.transform(data.to_numpy().reshape(-1, 1))

    # ----- Compute the threshold used to flag data above or below it
    data_trans_thres = data_trans.mean() + STD_NORM*data_trans.std()
    # Transform threshold back to original distribution
    data_thres = pt.inverse_transform(np.array(data_trans_thres).reshape(1, -1))
    data_thres = data_thres.flatten()[0]

    # If the data was shifted before, shift it back by the same constant.
    if 'k' in locals():
        data_thres = data_thres - k
        data = data - k

    # If side is 'left', flag values in the lower tail below the threshold.
    # If side is 'right', flag values in the upper tail above the threshold.
    if side == 'left':
        outliers = data[data < data_thres]
    elif side == 'right':
        outliers = data[data > data_thres]
    else:
        raise ValueError("Argument side has to be 'left'or 'right' ")

    # Flatten to convert the transformed data to a pandas Series
    data_trans = pd.Series(data_trans.flatten())

    if PLOT:
        FIG_SIZE = 3
        sns.set_style("darkgrid")
        sns.set_context("notebook")
        fig, ax = plt.subplots(nrows=3, figsize=(FIG_SIZE*2, FIG_SIZE*3), dpi=300)

        # Plot coeffMax before transformation
        sns.distplot(data, rug=True, kde=False, ax=ax[0], color=cp.cbPaired['blue'])
        ax[0].axvline(x=data_thres, c=cp.cbPaired['red'])
        ax[0].set_title(title, fontsize=title_fontsize)
        ax[0].set_xlabel(x_label, fontsize=label_fontsize)
        ax[0].set_ylabel(f"Frequency", fontsize=label_fontsize)

        # Plot coeffMax after transformation
        sns.distplot(data_trans, rug=True, kde=False, ax=ax[1], color=cp.cbPaired['purple'])
        ax[1].axvline(x=data_trans_thres, c=cp.cbPaired['red'])
        ax[1].set_xlabel(f"{METHOD.capitalize()} Transformed", fontsize=label_fontsize)
        ax[1].set_ylabel(f"Frequency", fontsize=label_fontsize)

        # Plot qqplot of coeffMax after transformation
        qqplot(data_trans, ax=ax[2], line='s', color=cp.cbPaired['purple'])

        plt.tight_layout()
        plt.show()

    return outliers, data_thres
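# A hypothetical call to get_outliers on a right-skewed Series; the data is made up
# for illustration, PLOT is left off, and the local colourPals module imported inside
# the function is assumed to be importable.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
skewed = pd.Series(rng.lognormal(mean=0.0, sigma=1.0, size=500))

# flag values more than 2 transformed standard deviations above the mean
outliers, threshold = get_outliers(skewed, STD_NORM=2, side='right', METHOD='yeo-johnson')
print(len(outliers), round(threshold, 3))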
Code example #29
0
def load_data_from_folder(
    folder_path,
    text_cols,
    tokenizer,
    label_col,
    label_list=None,
    categorical_cols=None,
    numerical_cols=None,
    sep_text_token_str=' ',
    categorical_encode_type='ohe',
    numerical_transformer_method='quantile_normal',
    empty_text_values=None,
    replace_empty_text=None,
    max_token_length=None,
    debug=False,
):
    """
    Function to load tabular and text data from a specified folder

    Loads train, test and/or validation text and tabular data from specified
    folder path into TorchTextDataset class and does categorical and numerical
    data preprocessing if specified. The folder is expected to contain
    a train.csv and a test.csv (and, if given, a val.csv) holding the training, testing,
    and validation sets respectively

    Args:
        folder_path (str): The path to the folder containing `train.csv`, and `test.csv` (and if given `val.csv`)
        text_cols (:obj:`list` of :obj:`str`): The column names in the dataset that contain text
            from which we want to load
        tokenizer (:obj:`transformers.tokenization_utils.PreTrainedTokenizer`):
            HuggingFace tokenizer used to tokenize the input texts as specified by text_cols
        label_col (str): The column name of the label, for classification the column should have
            int values from 0 to n_classes-1 as the label for each class.
            For regression the column can have any numerical value
        label_list (:obj:`list` of :obj:`str`, optional): Used for classification;
            the names of the classes indexed by the values in label_col.
        categorical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain categorical features. The features can be already prepared numerically, or
            could be preprocessed by the method specified by categorical_encode_type
        numerical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that contain numerical features.
            These columns should contain only numeric values.
        sep_text_token_str (str, optional): The string token that is used to separate between the
            different text columns for a given data example. For Bert for example,
            this could be the [SEP] token.
        categorical_encode_type (str, optional): Given categorical_cols, this specifies
            what method we want to preprocess our categorical features.
            choices: [ 'ohe', 'binary', None]
            see encode_features.CategoricalFeatures for more details
        numerical_transformer_method (str, optional): Given numerical_cols, this specifies
            what method we want to use for normalizing our numerical data.
            choices: ['yeo_johnson', 'box_cox', 'quantile_normal', None]
            see https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
            for more details
        empty_text_values (:obj:`list` of :obj:`str`, optional): specifies what texts should be considered as
            missing which would be replaced by replace_empty_text
        replace_empty_text (str, optional): The value of the string that will replace the texts
            that match with those in empty_text_values. If this argument is None then
            the text that match with empty_text_values will be skipped
        max_token_length (int, optional): The token length to pad or truncate to on the
            input text
        debug (bool, optional): Whether or not to load a smaller debug version of the dataset

    Returns:
        :obj:`tuple` of `tabular_torch_dataset.TorchTextDataset`:
            This tuple contains the
            training, validation and testing sets. The val dataset is :obj:`None` if
            there is no `val.csv` in folder_path
    """
    train_df = pd.read_csv(join(folder_path, 'train.csv'), index_col=0)
    test_df = pd.read_csv(join(folder_path, 'test.csv'), index_col=0)
    if exists(join(folder_path, 'val.csv')):
        val_df = pd.read_csv(join(folder_path, 'val.csv'), index_col=0)
    else:
        val_df = None

    if categorical_encode_type == 'ohe' or categorical_encode_type == 'binary':
        dfs = [df for df in [train_df, val_df, test_df] if df is not None]
        data_df = pd.concat(dfs, axis=0)
        if categorical_encode_type == 'ohe':
            data_df = pd.get_dummies(data_df,
                                     columns=categorical_cols,
                                     dummy_na=True)
            categorical_cols = [
                col for col in data_df.columns for old_col in categorical_cols
                if col.startswith(old_col) and len(col) > len(old_col)
            ]
        elif categorical_encode_type == 'binary':
            cat_feat_processor = CategoricalFeatures(data_df, categorical_cols,
                                                     'binary')
            vals = cat_feat_processor.fit_transform()
            cat_df = pd.DataFrame(vals, columns=cat_feat_processor.feat_names)
            data_df = pd.concat([data_df, cat_df], axis=1)
            categorical_cols = cat_feat_processor.feat_names

        train_df = data_df.loc[train_df.index]
        if val_df is not None:
            val_df = data_df.loc[val_df.index]
        test_df = data_df.loc[test_df.index]

        categorical_encode_type = None

    if numerical_transformer_method != 'none':
        if numerical_transformer_method == 'yeo_johnson':
            numerical_transformer = PowerTransformer(method='yeo-johnson')
        elif numerical_transformer_method == 'box_cox':
            numerical_transformer = PowerTransformer(method='box-cox')
        elif numerical_transformer_method == 'quantile_normal':
            numerical_transformer = QuantileTransformer(
                output_distribution='normal')
        else:
            raise ValueError(f'preprocessing transformer method '
                             f'{numerical_transformer_method} not implemented')
        num_feats = load_num_feats(train_df, convert_to_func(numerical_cols))
        numerical_transformer.fit(num_feats)
    else:
        numerical_transformer = None

    train_dataset = load_data(train_df, text_cols, tokenizer, label_col,
                              label_list, categorical_cols, numerical_cols,
                              sep_text_token_str, categorical_encode_type,
                              numerical_transformer, empty_text_values,
                              replace_empty_text, max_token_length, debug)
    test_dataset = load_data(test_df, text_cols, tokenizer, label_col,
                             label_list, categorical_cols, numerical_cols,
                             sep_text_token_str, categorical_encode_type,
                             numerical_transformer, empty_text_values,
                             replace_empty_text, max_token_length, debug)

    if val_df is not None:
        val_dataset = load_data(val_df, text_cols, tokenizer, label_col,
                                label_list, categorical_cols, numerical_cols,
                                sep_text_token_str, categorical_encode_type,
                                numerical_transformer, empty_text_values,
                                replace_empty_text, max_token_length, debug)
    else:
        val_dataset = None

    return train_dataset, val_dataset, test_dataset
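# A hypothetical call, assuming a folder containing train.csv / test.csv laid out as
# the docstring describes and a HuggingFace tokenizer; the path and column names
# below are made up for illustration.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_ds, val_ds, test_ds = load_data_from_folder(
    folder_path='datasets/reviews',
    text_cols=['review_text'],
    tokenizer=tokenizer,
    label_col='label',
    label_list=['negative', 'positive'],
    categorical_cols=['store_id'],
    numerical_cols=['price', 'review_count'],
    sep_text_token_str=tokenizer.sep_token,
    numerical_transformer_method='yeo_johnson',
)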
Code example #30
0
          'seagreen', 'royalblue', 'darkorchid']

fig, axes = plt.subplots(nrows=8, ncols=3, figsize=plt.figaspect(2))
axes = axes.flatten()
axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21),
             (13, 16, 19, 22), (14, 17, 20, 23)]
axes_list = [(axes[i], axes[j], axes[k], axes[l])
             for (i, j, k, l) in axes_idxs]


for distribution, color, axes in zip(distributions, colors, axes_list):
    name, X = distribution
    X_train, X_test = train_test_split(X, test_size=.5)

    # perform power transforms and quantile transform
    X_trans_bc = bc.fit(X_train).transform(X_test)
    lmbda_bc = round(bc.lambdas_[0], 2)
    X_trans_yj = yj.fit(X_train).transform(X_test)
    lmbda_yj = round(yj.lambdas_[0], 2)
    X_trans_qt = qt.fit(X_train).transform(X_test)

    ax_original, ax_bc, ax_yj, ax_qt = axes

    ax_original.hist(X_train, color=color, bins=BINS)
    ax_original.set_title(name, fontsize=FONT_SIZE)
    ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

    for ax, X_trans, meth_name, lmbda in zip(
            (ax_bc, ax_yj, ax_qt),
            (X_trans_bc, X_trans_yj, X_trans_qt),
            ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
Code example #31
0
File: data_cleaning_flow.py  Project: crawftv/crawto
def fit_yeo_johnson_transformer(train_imputed_numeric_df: pd.DataFrame):
    yeo_johnson_transformer = PowerTransformer(method="yeo-johnson", copy=True)
    yeo_johnson_transformer.fit(train_imputed_numeric_df)
    return yeo_johnson_transformer