def __init__(self, df, model=Models.PROPHET, upsample_freq=None,
             train_test_split_ratio=Constants.TRAIN_TEST_SPLIT_RATIO.value,
             epochs=Constants.EPOCHS.value,
             initial_epoch=Constants.INITIAL_EPOCH.value,
             batch_size=Constants.BATCH_SIZE.value,
             sliding_window_size_or_time_steps=Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value,
             do_shuffle=True):
    logging.info("resample: {}, future_prediction: {}, epochs: {}, batch_size: {},"
                 " window_size: {}, neurons: {}"
                 .format(Constants.RESAMPLING_FREQ.value,
                         Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value,
                         epochs,
                         batch_size,
                         sliding_window_size_or_time_steps,
                         Constants.NEURONS.value))
    if logging.getLogger().isEnabledFor(logging.INFO):
        explore_data(df)

    # first step is to create a timestamp column as index to turn this into a time series
    df.index = pd.to_datetime(df[ColumnNames.DATE.value] + df[ColumnNames.TIME.value],
                              format='%Y-%m-%d%H:%M:%S', errors='raise')
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

    # keep a copy of the original dataset for future comparison
    self.df_original = df.copy()

    # interpolate temperature using Prophet so it can be used in a multivariate forecast
    temperature = ColumnNames.TEMPERATURE.value
    interpolated_df = facebook_prophet_filter(df, temperature,
                                              Constants.FORECASTED_TEMPERATURE_FILE.value)
    interpolated_df.index = df.index
    df[[temperature]] = interpolated_df[[ColumnNames.FORECAST.value]]

    # also interpolate missing kWh using Prophet (alternatively those rows could simply be dropped),
    # then make the format compatible with Prophet
    power = ColumnNames.POWER.value
    interpolated_df = facebook_prophet_filter(df, power,
                                              Constants.FORECASTED_POWER_FILE.value)
    interpolated_df.index = df.index
    df[[power]] = interpolated_df[[ColumnNames.FORECAST.value]]
    df = df.rename(columns={power: ColumnNames.LABEL.value})

    df.drop(columns=[ColumnNames.DATE.value, ColumnNames.TIME.value,
                     ColumnNames.DAY_OF_WEEK.value, ColumnNames.MONTH.value],
            inplace=True)
    if upsample_freq is not None:
        df = df.resample(upsample_freq).mean()

    # for any regression or forecasting it is better to work with normalized data
    self.transformer = QuantileTransformer()  # handles outliers better than MinMaxScaler
    features = ColumnNames.FEATURES.value
    normalized = normalize(df, features, transformer=self.transformer)

    # the last part (after 12/1/2013) has no temperature, so it is kept for testing
    cutoff_date = Constants.CUTOFF_DATE.value
    self.df = normalized[normalized.index < cutoff_date]
    self.testing = normalized[normalized.index >= cutoff_date]
    self.df[ColumnNames.DATE_STAMP.value] = self.df.index
    self.df_blocked = None
    self.train_test_split_ratio = train_test_split_ratio
    self.model_type = model
    self.train_X, self.test_X, self.train_test_split_index = self.train_test_split(self.df[features])
    self.train_y, self.test_y, _ = self.train_test_split(self.df[ColumnNames.LABELS.value])
    self.model_fit = None
    self.epochs = epochs
    self.initial_epoch = initial_epoch
    self.batch_size = batch_size
    self.history = None
    # the following attributes are defined in sliding_window
    self.do_shuffle = do_shuffle
    self.val_idx = None
    self.shuffled_X = None
    self.shuffled_y = None
    self.train = None
    self.label = None
    self.train_size = None
    self.val_size = None
    if logging.getLogger().isEnabledFor(logging.INFO):
        explore_data(self.df)
# }
lgb_param = utility.json2param('magic')
lgb_clf0 = lgb.LGBMClassifier(**lgb_param)
lgb_clf1 = lgb.LGBMClassifier(**lgb_param)
lgb_clf2 = lgb.LGBMClassifier(**lgb_param)

# catboost
import catboost
cat_param = utility.json2param('catboost')
# cat_clf = catboost.CatBoostClassifier(**cat_param)

# create a naive Bayes classifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
# note: GaussianNB takes no n_classes argument; the number of classes is inferred from y
# NB_clf = make_pipeline(QuantileTransformer(output_distribution='normal'), GaussianNB(n_classes=2))
NB_clf = make_pipeline(QuantileTransformer(output_distribution='normal'), GaussianNB())

import Stacking

# build the stacked ensemble
# from sklearn.model_selection import StratifiedKFold
# kfold = StratifiedKFold(n_splits=2, random_state=999).split(X, y)  # kfold is a generator that is exhausted after one use
clf_list = [lgb_clf0, lgb_clf1, lgb_clf2, NB_clf]
layer0 = Stacking.layering(clf_list)
layer0_out = layer0.fit_blend(X, y, cv=10)

# # last layer (meta)
# from sklearn.linear_model import LogisticRegression
# meta_clf = LogisticRegression(n_jobs=4, random_state=123)
#
# meta_clf.fit(layer0_out, y.reshape(-1, 1))
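# A minimal hedged sketch of the meta (last) layer that the commented-out code above hints at.
# It assumes layer0_out holds the out-of-fold predictions produced by fit_blend, that y is the
# binary training target, and that Stacking.layering/fit_blend are project-specific helpers;
# only the scikit-learn part is shown here, it is not the author's confirmed pipeline.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

meta_clf = LogisticRegression(n_jobs=4, random_state=123)
meta_clf.fit(layer0_out, y)                              # train the meta learner on blended features
meta_train_pred = meta_clf.predict_proba(layer0_out)[:, 1]
print('meta-layer train AUC:', roc_auc_score(y, meta_train_pred))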
# In[91]:

classifier_pipeline = Pipeline(steps=[
    ('feature_processing', ColumnTransformer(transformers=[
        # binary
        ('binary', Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))]),
         binary_features),
        # numeric
        ('numeric', Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale', RobustScaler()),
            ('transform', QuantileTransformer(output_distribution='normal')),
            ('engineer', PolynomialFeatures())]),
         numerical_features),
        # categorical
        ('categorical', Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-10000)),
            ('toint', FunctionTransformer(lambda x: x.astype('int64')))
            # ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ]),
         categorical_features),
    ])),
])
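# A hedged usage sketch: classifier_pipeline above only performs feature processing, so a
# typical next step is to append an estimator and fit on a training frame. The estimator
# choice (LogisticRegression) and the X_train/y_train/X_test/y_test names are assumptions
# for illustration, not part of the original notebook.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

full_pipeline = Pipeline(steps=[
    ('features', classifier_pipeline),          # reuse the processing pipeline defined above
    ('clf', LogisticRegression(max_iter=1000)),
])
full_pipeline.fit(X_train, y_train)
print('held-out accuracy:', full_pipeline.score(X_test, y_test))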
test = data[data.target.isnull()].copy()

target_col = "target"
drop_cols = ["spectrum_id", "spectrum_filename", "chip_id"]
X_train = train.drop(drop_cols + [target_col], axis=1)
y_train = train[target_col].values
X_test = test.drop(drop_cols + [target_col], axis=1)

# fill inf/nan
X_train.replace(np.inf, np.nan, inplace=True)
X_test.replace(np.inf, np.nan, inplace=True)
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_train.mean(), inplace=True)

# rank-gauss transform
# https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629
prep = QuantileTransformer(output_distribution="normal")
X_cont_train = prep.fit_transform(X_train)
X_cont_test = prep.transform(X_test)

# train and predict
timestamp = get_timestamp()
oof_preds, test_preds, cv_scores = run(
    X_seq_train,
    X_cont_train,
    y_train,
    X_seq_test,
    X_cont_test,
    timestamp,
    random_state=0,
)
data = np.random.power(5, (100, 2))
# data = np.column_stack((x, y))

# scale the data in several different ways
all_scalings = {
    'A_non_scaled': data,
    'B_min_max': MinMaxScaler().fit_transform(data),
    'C_standard': StandardScaler().fit_transform(data),
    'D_robust': RobustScaler().fit_transform(data),
    'E_quantile_uniform': QuantileTransformer(output_distribution='uniform').fit_transform(data),
    'F_quantile_normal': QuantileTransformer(output_distribution='normal').fit_transform(data)
}

# plot
i = 0
for scaling in sorted(all_scalings.keys()):
    i += 1
    plt.subplot(3, 2, i)
    plt.title(scaling)
    x = all_scalings[scaling][:, 0]
    y = all_scalings[scaling][:, 1]
    plt.scatter(x=x, y=y)

plt.tight_layout()
plt.show()
def fit(self): """ perform model fitting """ # initialize y_vals = np.zeros((self.train_df.shape[0], )) if self.task == "multiclass": n_class = len(np.unique(self.train_df[self.target].values)) oof_pred = np.zeros((self.train_df.shape[0], n_class)) y_pred = np.zeros((self.test_df.shape[0], n_class)) else: oof_pred = np.zeros((self.train_df.shape[0], )) y_pred = np.zeros((self.test_df.shape[0], )) # group does not kick in when group k fold is used if self.group is not None: if self.group in self.features: self.features.remove(self.group) if self.group in self.categoricals: self.categoricals.remove(self.group) fi = np.zeros((self.n_splits, len(self.features))) # target encoding numerical_features = [f for f in self.features if f not in self.categoricals] if self.target_encoding: # perform target encoding overall_mean = self.train_df[self.target].mean() for c in self.categoricals: data_tmp = pd.DataFrame({c: self.train_df[c].values, 'target': self.train_df[self.target].values}) tmp = np.nan * np.ones(self.train_df.shape[0]) cv = self.get_cv() for fold, (train_idx, val_idx) in enumerate(cv): target_mean = data_tmp.iloc[train_idx].groupby(c)['target'].mean() tmp[val_idx] = self.train_df[c].iloc[val_idx].map(target_mean).values self.train_df[c] = tmp # replace categorical variable in test target_mean = data_tmp.groupby(c)['target'].mean() self.test_df.loc[:, c] = self.test_df[c].map(target_mean).values # no categoricals any more numerical_features = self.features.copy() self.categoricals = [] # fill nan if self.model not in ['lgb', 'catb', 'xgb']: # fill NaN (numerical features -> median, categorical features -> mode) self.train_df[numerical_features] = self.train_df[numerical_features].replace([np.inf, -np.inf], np.nan) self.test_df[numerical_features] = self.test_df[numerical_features].replace([np.inf, -np.inf], np.nan) self.train_df[numerical_features] = self.train_df[numerical_features].fillna(self.train_df[numerical_features].median()) self.test_df[numerical_features] = self.test_df[numerical_features].fillna(self.test_df[numerical_features].median()) self.train_df[self.categoricals] = self.train_df[self.categoricals].fillna(self.train_df[self.categoricals].mode().iloc[0]) self.test_df[self.categoricals] = self.test_df[self.categoricals].fillna(self.test_df[self.categoricals].mode().iloc[0]) # scaling, if necessary if self.scaler is not None: # to normal pt = QuantileTransformer(n_quantiles=100, random_state=self.seed, output_distribution="normal") self.train_df[numerical_features] = pt.fit_transform(self.train_df[numerical_features]) self.test_df[numerical_features] = pt.transform(self.test_df[numerical_features]) # starndardize if self.scaler == "MinMax": scaler = MinMaxScaler() elif self.scaler == "Standard": scaler = StandardScaler() self.train_df[numerical_features] = scaler.fit_transform(self.train_df[numerical_features]) self.test_df[numerical_features] = scaler.transform(self.test_df[numerical_features]) x_test = self.test_df.copy() if self.model == "nn": x_test = [np.absolute(x_test[i]) for i in self.categoricals] + [x_test[numerical_features]] else: x_test = x_test[self.features] else: x_test = self.test_df[self.features] # fitting with out of fold cv = self.get_cv() for fold, (train_idx, val_idx) in enumerate(cv): # train test split x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx] y_train, y_val = self.train_df[self.target].iloc[train_idx], self.train_df[self.target].iloc[val_idx] if self.model == "nn": x_train = 
[np.absolute(x_train[i]) for i in self.categoricals] + [x_train[numerical_features]] x_val = [np.absolute(x_val[i]) for i in self.categoricals] + [x_val[numerical_features]] # model fitting train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val) model, importance = self.train_model(train_set, val_set) fi[fold, :] = importance y_vals[val_idx] = y_val # predictions and check cv score oofs, ypred = get_oof_ypred(model, x_val, x_test, self.model, self.task) y_pred += ypred.reshape(y_pred.shape) / self.n_splits if self.task == "multiclass": oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape) print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], np.argmax(oof_pred[val_idx, :], axis=1)))) else: oof_pred[val_idx] = oofs.reshape(oof_pred[val_idx].shape) print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], oof_pred[val_idx]))) # feature importance data frame fi_df = pd.DataFrame() for n in np.arange(self.n_splits): tmp = pd.DataFrame() tmp["features"] = self.features tmp["importance"] = fi[n, :] tmp["fold"] = n fi_df = pd.concat([fi_df, tmp], ignore_index=True) gfi = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index() fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean')) # outputs if self.task == "multiclass": loss_score = self.calc_metric(y_vals, np.argmax(oof_pred, axis=1)) else: loss_score = self.calc_metric(y_vals, oof_pred) if self.verbose: print('Our oof loss score is: ', loss_score) return y_pred, loss_score, model, oof_pred, y_vals, fi_df
class Dataset: """ Class to generate the data matrices (train, validation and test) """ ## Train X matrix train_x = None ## Train y matrix train_y = None ## Validation X matrix val_x = None ## Validation y matrix val_y = None ## Test X matrix test_x = None ## Test y matrix test_y = None ## Path to the datafiles data_path = None ## Section 'data' of the configuration file config = None ## Mode of the dataset mode = None ## Functions to use for scaling the data scalers = { 'standard': StandardScaler(), 'minmax': MinMaxScaler(feature_range=(-1, 1)), 'tanh': tanh_normalization(), 'robustscaler': RobustScaler(), 'quantile': QuantileTransformer() } ## Strings corresponding to the different dataset configurations dataset_type = [ 'onesiteonevar', 'onesitemanyvar', 'manysiteonevar', 'manysitemanyvar', 'manysitemanyvarstack', 'manysitemanyvarstackneigh' ] generated = False raw_data = None scaler = None # Scaler object so data can be rescaled after training def __init__(self, config, data_path): """ Initializes the object with the data configuration section of the configuration file and the path where the actual data is :param config: :param data_path: """ self.config = config self.data_path = data_path def is_teacher_force(self): """ Returns if the data matrix is configured for teaching force :return: """ return self.config['dmatrix'] == 'teach_force' def is_dependent_auxiliary(self): """ Returns if the data matrix is cofigured to separate dependent and independent variables :return: """ return self.config['dmatrix'] == 'dep_aux' def _generate_dataset_one_var(self, data, datasize, testsize, lag=1, ahead=1, slice=1, mode=None): """ Generates dataset matrices for one variable according to the lag and ahead horizon. The ahead horizon can be sliced to a subset of the horizon The dimensions of the matrix are adapted accordingly to the input and output dimensions of the model Input: By default is a 3D matrix - examples x variables x lag 2D - examples x (variables * lag) Output: 3D - examples x horizon x 1 2D - examples x horizon 1D - examples x 1 x 1 0D - examples x 1 'scaling' is obtained from the data section of the configuration 'fraction' allows selecting only a part of the data, selects from the end :param data: :param datasize: :param testsize: :param lag: :param ahead: :param slice: :param mode: :return: :return: """ if 'scaler' in self.config and self.config['scaler'] in self.scalers: scaler = self.scalers[self.config['scaler']] tmpdata = scaler.fit_transform(data) self.scaler = scaler.fit(data[:, 0].reshape( -1, 1)) # saves the scaler for the first variable for descaling data = tmpdata # else: # scaler = StandardScaler() # data = scaler.fit_transform(data) mode_x, mode_y = mode if 'fraction' in self.config: isize = int((1 - self.config['fraction']) * datasize) wind_train = data[isize:datasize, :] else: wind_train = data[:datasize, :] train = lagged_vector(wind_train, lag=lag, ahead=ahead, mode=mode) train_x = train[:, :lag] ####################################### if mode_x == '2D': train_x = np.reshape(train_x, (train_x.shape[0], train_x.shape[1])) elif mode_x == '4D': raise NameError('4D is not possible when there is only a variable') # Default is '3D' if mode_y == '3D': train_y = train[:, -slice:, 0] train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1], 1)) elif mode_y == '2D': train_y = train[:, -slice:, 0] train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1])) elif mode_y == '1D': train_y = train[:, -1:, 0] elif mode_y == '0D': train_y = np.ravel(train[:, -1:, 0]) 
else: train_y = train[:, -1:, 0] wind_test = data[datasize:datasize + testsize, 0].reshape(-1, 1) test = lagged_vector(wind_test, lag=lag, ahead=ahead, mode=mode) half_test = int(test.shape[0] / 2) val_x = test[:half_test, :lag] test_x = test[half_test:, :lag] ####################################### if mode_x == '2D': val_x = np.reshape(val_x, (val_x.shape[0], val_x.shape[1])) test_x = np.reshape(test_x, (test_x.shape[0], test_x.shape[1])) elif mode_x == '4D': raise NameError('4D is not possible when there is only a variable') # Default is '3D' if mode_y == '3D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1], 1)) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1], 1)) elif mode_y == '2D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1])) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1])) elif mode_y == '1D': val_y = test[:half_test, -1:, 0] test_y = test[half_test:, -1:, 0] elif mode_y == '0D': val_y = np.ravel(test[:half_test, -1:, 0]) test_y = np.ravel(test[half_test:, -1:, 0]) else: # Default is '1D' val_y = test[:half_test, -1:, 0] test_y = test[half_test:, -1:, 0] return train_x, train_y, val_x, val_y, test_x, test_y def _generate_dataset_multiple_var(self, data, datasize, testsize, lag=1, ahead=1, slice=1, mode=None): """ Generates dataset matrices for one variable according to the lag and ahead horizon. The ahead horizon can be sliced to a subset of the horizon The dimensions of the matrix are adapted accordingly to the input and output dimensions of the model Input: By default is a 3D matrix - examples x lag x variables 2D - examples x (lag * variables) Output: 3D - examples x horizon x 1 2D - examples x horizon 1D - examples x 1 x 1 0D - examples x 1 'scaling' is obtained from the data section of the configuration 'fraction' allows selecting only a part of the data, selects from the end :return: """ if 'scaler' in self.config and self.config['scaler'] in self.scalers: scaler = self.scalers[self.config['scaler']] tmpdata = scaler.fit_transform(data) self.scaler = scaler.fit(data[:, 0].reshape( -1, 1)) # saves the scaler for the first variable for descaling data = tmpdata # else: # scaler = StandardScaler() # data = scaler.fit_transform(data) # print('DATA Dim =', data.shape) mode_x, mode_y = mode if 'fraction' in self.config: isize = int((1 - self.config['fraction']) * datasize) wind_train = data[isize:datasize, :] else: self.config['fraction'] = 1 wind_train = data[:datasize, :] # print('Train Dim =', wind_train.shape) # Train train = lagged_matrix(wind_train, lag=lag, ahead=ahead, mode=mode) train_x = train[:, :lag] if 'aggregate' in self.config and 'x' in self.config['aggregate']: step = self.config['aggregate']['x']['step'] if self.config['aggregate']['x']['method'] == 'average': train_x = aggregate_average_all(train_x, step) elif self.config['aggregate']['x']['method'] == 'max': train_x = aggregate_max_min_all(train_x, step, aggmax=True) elif self.config['aggregate']['x']['method'] == 'min': train_x = aggregate_max_min_all(train_x, step, aggmax=False) # Signal decomposition if 'decompose' in self.config and 'x' in self.config['decompose']: components = self.config['decompose']['x']['components'] if type(self.config['decompose']['x']['var']) == int: var = self.config['decompose']['x']['var'] train_x = apply_SSA_decomposition_one(var, components, train_x) else: train_x = 
apply_SSA_decomposition_all(components, train_x) ####################################### print('pollo', mode_y) if mode_x == '2D': # Interchange axes 1 and 2 so the variables values are contiguous in the 2D matrix train_x = np.swapaxes(train_x, 1, 2) train_x = np.reshape( train_x, (train_x.shape[0], train_x.shape[1] * train_x.shape[2])) elif mode_x == '4D': # Add an extra dimension to simulate that we have only one channel train_x = np.reshape( train_x, (train_x.shape[0], train_x.shape[1], train_x.shape[2], 1)) if mode_y == '3D': train_y = train[:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: step = self.config['aggregate']['y']['step'] if self.config['aggregate']['y']['method'] == 'average': train_y = aggregate_average(train_y, step) elif self.config['aggregate']['y']['method'] == 'max': train_y = aggregate_max_min(train_y, step, aggmax=True) elif self.config['aggregate']['y']['method'] == 'min': train_y = aggregate_max_min(train_y, step, aggmax=False) # Decompose prediction and keep one of the components if 'decompose' in self.config and 'y' in self.config['decompose']: components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, train_y) train_y = dec_y[:, :, self.config['decompose']['y']['var']] # We need an additional third dimension train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1], 1)) elif mode_y == '2D': train_y = train[:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: print('hello pollastre', self.config['aggregate']['y']['method']) step = self.config['aggregate']['y']['step'] if self.config['aggregate']['y']['method'] == 'average': train_y = aggregate_average(train_y, step) elif self.config['aggregate']['y']['method'] == 'max': train_y = aggregate_max_min(train_y, step, aggmax=True) elif self.config['aggregate']['y']['method'] == 'min': train_y = aggregate_max_min(train_y, step, aggmax=False) # Decompose prediction and keep one of the components if 'decompose' in self.config and 'y' in self.config['decompose']: components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, train_y) train_y = dec_y[:, :, self.config['decompose']['y']['var']] train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1])) elif mode_y == '1D': train_y = train[:, -1:, 0] elif mode_y == '0D': train_y = np.ravel(train[:, -1:, 0]) else: train_y = train[:, -slice:, 0] # Test and Val wind_test = data[datasize:datasize + testsize, :] test = lagged_matrix(wind_test, lag=lag, ahead=ahead, mode=mode) half_test = int(test.shape[0] / 2) val_x = test[:half_test, :lag] test_x = test[half_test:, :lag] if 'aggregate' in self.config and 'x' in self.config['aggregate']: step = self.config['aggregate']['x']['step'] if self.config['aggregate']['x']['method'] == 'average': val_x = aggregate_average_all(val_x, step) test_x = aggregate_average_all(test_x, step) elif self.config['aggregate']['x']['method'] == 'max': val_x = aggregate_max_min_all(val_x, step, aggmax=True) test_x = aggregate_max_min_all(test_x, step, aggmax=True) elif self.config['aggregate']['x']['method'] == 'min': val_x = aggregate_max_min_all(val_x, step, aggmax=False) test_x = aggregate_max_min_all(test_x, step, aggmax=False) if 'decompose' in self.config and 'x' in self.config['decompose']: components = self.config['decompose']['x']['components'] if type(self.config['decompose']['x']['var']) == int: var = self.config['decompose']['x']['var'] val_x = apply_SSA_decomposition_one(var, 
components, val_x) test_x = apply_SSA_decomposition_one(var, components, test_x) else: val_x = apply_SSA_decomposition_all(components, val_x) test_x = apply_SSA_decomposition_all(components, test_x) ######################################################## if mode_x == '2D': val_x = np.swapaxes(val_x, 1, 2) val_x = np.reshape( val_x, (val_x.shape[0], val_x.shape[1] * val_x.shape[2])) test_x = np.swapaxes(test_x, 1, 2) test_x = np.reshape( test_x, (test_x.shape[0], test_x.shape[1] * test_x.shape[2])) elif mode_x == '4D': # Add an extra dimension to simulate that we have only one channel val_x = np.reshape( val_x, (val_x.shape[0], val_x.shape[1], val_x.shape[2], 1)) test_x = np.reshape( test_x, (test_x.shape[0], test_x.shape[1], test_x.shape[2], 1)) if mode_y == '3D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: step = self.config['aggregate']['step'] if self.config['aggregate']['method'] == 'average': val_y = aggregate_average(val_y, step) test_y = aggregate_average(test_y, step) elif self.config['aggregate']['method'] == 'max': val_y = aggregate_max_min(val_y, step, aggmax=True) test_y = aggregate_max_min(test_y, step, aggmax=True) elif self.config['aggregate']['method'] == 'min': val_y = aggregate_max_min(val_y, step, aggmax=False) test_y = aggregate_max_min(test_y, step, aggmax=False) # Decompose prediction and keep one of the components if 'decompose' in self.config and 'y' in self.config['decompose']: components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, val_y) val_y = dec_y[:, :, self.config['decompose']['y']['var']] dec_y = apply_SSA_decomposition_y(components, test_y) test_y = dec_y[:, :, self.config['decompose']['y']['var']] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1], 1)) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1], 1)) elif mode_y == '2D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: step = self.config['aggregate']['y']['step'] if self.config['aggregate']['y']['method'] == 'average': val_y = aggregate_average(val_y, step) test_y = aggregate_average(test_y, step) elif self.config['aggregate']['y']['method'] == 'max': val_y = aggregate_max_min(val_y, step, aggmax=True) test_y = aggregate_max_min(test_y, step, aggmax=True) elif self.config['aggregate']['y']['method'] == 'min': val_y = aggregate_max_min(val_y, step, aggmax=False) test_y = aggregate_max_min(test_y, step, aggmax=False) if 'decompose' in self.config and 'y' in self.config['decompose']: # Decompose prediction and keep one of the components components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, val_y) val_y = dec_y[:, :, self.config['decompose']['y']['var']] dec_y = apply_SSA_decomposition_y(components, test_y) test_y = dec_y[:, :, self.config['decompose']['y']['var']] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1])) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1])) elif mode_y == '1D': val_y = test[:half_test, -1:, 0] test_y = test[half_test:, -1:, 0] elif mode_y == '0D': val_y = np.ravel(test[:half_test, -1:, 0]) test_y = np.ravel(test[half_test:, -1:, 0]) else: val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] return train_x, train_y, val_x, val_y, test_x, test_y def load_raw_data(self, remote=False): """ Loads the data so some computations can be 
performed :return: """ datanames = self.config['datanames'] d = datanames[0] # just the main dataset vars = self.config['vars'] if 'angle' in self.config: angle = self.config['angle'] else: angle = False if remote: srv = pysftp.Connection(host=remote_data[0], username=remote_data[1]) srv.get(remote_wind_data_path + f"/{d}.npy", self.data_path + f"/{d}.npy") srv.close() if angle: wind = np.load(self.data_path + '_angle' + f"/{d}.npy") else: wind = np.load(self.data_path + f"/{d}.npy") if remote: os.remove(self.data_path + f"/{d}.npy") # If there is a list in vars attribute it should be a list of integers if type(vars) == list: for v in vars: if type(v) != int or v > wind.shape[1]: raise NameError('Error in variable selection') wind = wind[:, vars] self.raw_data = wind def generate_dataset(self, ahead=1, mode=None, ensemble=False, ens_slice=None, remote=None): """ Generates the dataset for training, test and validation 0 = One site - wind 1 = One site - all variables 2 = All sites - wind 3 = All sites - all variables 4 = All sites - all variables stacked 5 = Uses neighbor sites around a radius :param ens_slice: (not yet used) :param remote: Use remote data :param ensemble: (not yet used) :param datanames: Name of the wind datafiles :param ahead: number of steps ahead for prediction :param mode: type of dataset (pair indicating the type of dimension for input and output) :return: """ self.generated = True self.mode = mode datanames = self.config['datanames'] datasize = self.config['datasize'] testsize = self.config['testsize'] lag = self.config['lag'] vars = self.config['vars'] wind = {} if 'angle' in self.config: angle = self.config['angle'] else: angle = False # ahead = self.config['ahead'] if (type(self.config['ahead']) == list) else [1, self.config['ahead']] if type(ahead) == list: dahead = ahead[1] slice = (ahead[1] - ahead[0]) + 1 else: dahead = ahead slice = ahead # Augment the dataset with the closest neighbors if self.config['dataset'] == 5 or self.config['dataset'] == 31: if 'radius' not in self.config: raise NameError( "Radius missing for neighbours augmented dataset") else: radius = self.config['radius'] if 'nneighbors' in self.config: datanames = get_closest_k_neighbors(datanames[0], radius, self.config['nneighbors']) else: print('before', datanames) datanames = get_all_neighbors(datanames[0], radius) print('after', datanames) # Reads numpy arrays for all sites and keeps only selected columns for d in datanames: if remote: srv = pysftp.Connection(host=remote_data[0], username=remote_data[1]) srv.get(remote_wind_data_path + f"/{d}.npy", self.data_path + f"/{d}.npy") srv.close() if angle: wind[d] = np.load(self.data_path + '_angle' + f"/{d}.npy") else: wind[d] = np.load(self.data_path + f"/{d}.npy") if remote: os.remove(self.data_path + f"/{d}.npy") # If there is a list in vars attribute it should be a list of integers if type(vars) == list: for v in vars: if type(v) != int or v > wind[d].shape[1]: raise NameError('Error in variable selection') wind[d] = wind[d][:, vars] if (self.config['dataset'] == 0) or (self.config['dataset'] == 'onesiteonevar'): if not ensemble: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_one_var(wind[datanames[0]][:, 0].reshape(-1, 1), datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) else: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_one_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], 0].reshape(-1, 1), datasize, 
testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif (self.config['dataset'] == 1) or (self.config['dataset'] == 'onesitemanyvar'): if not ensemble: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(wind[datanames[0]], datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) else: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(wind[datanames[0][ens_slice[0]::ens_slice[1], :]], datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif self.config['dataset'] == 2 or self.config[ 'dataset'] == 'manysiteonevar': stacked = np.vstack([wind[d][:, 0] for d in datanames]).T self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(stacked, datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif self.config['dataset'] == 3 or self.config[ 'dataset'] == 31 or self.config['dataset'] == 'manysitemanyvar': stacked = np.hstack([wind[d] for d in datanames]) self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(stacked, datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif self.config['dataset'] == 4 or self.config['dataset'] == 5 or \ self.config['dataset'] == 'manysitemanyvarstack': stacked = [ self._generate_dataset_multiple_var(wind[d], datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) for d in datanames ] self.train_x = np.vstack([x[0] for x in stacked]) self.train_y = np.vstack([x[1] for x in stacked]) self.val_x = stacked[0][2] self.val_y = stacked[0][3] self.test_x = stacked[0][4] self.test_y = stacked[0][5] else: raise NameError('ERROR: No such dataset type') def get_data_matrices(self): """ Returns the data matrices for training, validation and test :return: """ if not 'dmatrix' in self.config or self.config['dmatrix'] == 'normal': return self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y elif self.config['dmatrix'] == 'teach_force': return self.teacher_forcing() elif self.config['dmatrix'] == 'dep_aux': return self.dependent_auxiliary() elif self.config['dmatrix'] == 'future': return self.auxiliary_future() else: raise NameError("DataSet: No such dmatrix type") def teacher_forcing(self): """ returns data matrices for teacher forcing/attention assuming that data is for RNN :return: """ # Use the last element of wind traininig data as the first of teacher forcing tmp = self.train_x[:, -1, 0] tmp = tmp.reshape(tmp.shape[0], 1, 1) train_y_tf = np.concatenate((tmp, self.train_y[:, :-1, :]), axis=1) tmp = self.test_x[:, -1, 0] tmp = tmp.reshape(tmp.shape[0], 1, 1) test_y_tf = np.concatenate((tmp, self.test_y[:, :-1, :]), axis=1) tmp = self.val_x[:, -1, 0] tmp = tmp.reshape(tmp.shape[0], 1, 1) val_y_tf = np.concatenate((tmp, self.val_y[:, :-1, :]), axis=1) return [self.train_x, train_y_tf], self.train_y, \ [self.val_x, val_y_tf], self.val_y, \ [self.test_x, test_y_tf], self.test_y def dependent_auxiliary(self): """ Return data matrices separating dependent variable from the rest This is for two headed architecture with dependent and auxiliary variables in separated branches :return: """ horizon = self.config['lag'] if self.mode[1] != '2D': return [self.train_x[:, :, 0].reshape(self.train_x.shape[0], self.train_x.shape[1], 1), self.train_x[:, :, 1:]], self.train_y, \ [self.val_x[:, :, 0].reshape(self.val_x.shape[0], self.val_x.shape[1], 1), 
self.val_x[:, :, 1:]], self.val_y, \ [self.test_x[:, :, 0].reshape(self.test_x.shape[0], self.test_x.shape[1], 1), self.test_x[:, :, 1:]], self.test_y else: return [self.train_x[:, :horizon].train_x[:, horizon:, ]], self.train_y, \ [self.val_x[:, :horizon], self.val_x[:, :horizon]], self.val_y, \ [self.test_x[:, :horizon], self.test_x[:, :horizon]], self.test_y def auxiliary_future(self): """ Returns data matrices adding a matrix for the future for a subset of the auxiliary matrices :return: """ # Future variable, just one for now datalag = self.config['lag'] future = self.config['varsf'][0] ahead = self.config['ahead'] if (type( self.config['ahead']) == list) else [1, self.config['ahead']] if type(ahead) == list: dahead = ahead[1] slice = (ahead[1] - ahead[0]) + 1 else: dahead = ahead slice = ahead if self.mode[1] != '2D': # The values of the future variable are dahead positions from the start train_x_future = self.train_x[dahead:, -slice:, future] val_x_future = self.val_x[dahead:, -slice:, future] test_x_future = self.test_x[dahead:, -slice:, future] else: nvars = len(self.config['vars']) train_x_future = self.train_x[datalag - 1:, (future * datalag) + ahead[0]:(future * datalag) + ahead[0] + slice] val_x_future = self.val_x[datalag - 1:, (future * datalag) + ahead[0]:(future * datalag) + ahead[0] + slice] test_x_future = self.test_x[datalag - 1:, (future * datalag) + ahead[0]:(future * datalag) + ahead[0] + slice] # We lose the last datalag-1 examples because we do not have their full future in the data matrix return [self.train_x[:-(datalag - 1)], train_x_future], self.train_y[:-(datalag - 1)], [ self.val_x[:-(datalag - 1)], val_x_future], self.val_y[:-(datalag - 1)], \ [self.test_x[:-(datalag - 1)], test_x_future], self.test_y[:-(datalag - 1)] def summary(self): """ Dataset Summary of its characteristics :return: """ if self.train_x is None: raise NameError('Data not loaded yet') else: print("--- Dataset Configuration-----------") print(f"Dataset name: {self.config['datanames']}") if 'fraction' in self.config: print(f"Data fraction: {self.config['fraction']}") else: print(f"Data fraction: 2") print(f"Training: X={self.train_x.shape} Y={self.train_y.shape}") print(f"Validation: X={self.val_x.shape} Y={self.val_y.shape}") print(f"Tests: X={self.test_x.shape} T={self.test_y.shape}") if type(self.config['dataset']) == int: print( f"Dataset type= {self.dataset_type[self.config['dataset']]}" ) else: print(f"Dataset type= {self.config['dataset']}") if 'scaler' in self.config: print(f"Scaler= {self.config['scaler']}") else: print(f"Scaler= standard") if 'dmatrix' in self.config: print(f"Data matrix configuration= {self.config['dmatrix']}") print(f"Vars= {self.config['vars']}") print(f"Lag= {self.config['lag']}") print(f"Ahead= {self.config['ahead']}") print("------------------------------------") def compute_measures(self, var, window=None): """ Computing some measures with the wind series Window is a dictionary with a keyword for the windoe size and a window length :return: """ if self.raw_data is None: raise NameError("Raw data is not loaded") if var > self.raw_data.shape[1]: raise NameError("Invalid variable number") dvals = {} dvals['SpecEnt'] = spectral_entropy(self.raw_data[:, var], sf=1) dvals['SampEnt'] = sample_entropy(self.raw_data[:, var], order=2) data = self.raw_data[:, var] for w in window: lw = window[w] length = int(data.shape[0] / lw) size = lw * length datac = data[:size] datac = datac.reshape(-1, lw) means = np.mean(datac, axis=1) vars = np.std(datac, axis=1) 
dvals[f'Stab{w}'] = np.std(means) dvals[f'Lump{w}'] = np.std(vars) return dvals
from sklearn.model_selection import train_test_split

df = pd.read_csv("baddata.txt", delimiter=r'\s+', header=None)
X = df.iloc[:, :].values

N_SAMPLES = 1000
FONT_SIZE = 6
BINS = 30

rng = np.random.RandomState(304)
bc = PowerTransformer(method='box-cox')
yj = PowerTransformer(method='yeo-johnson')
# n_quantiles is set to the training set size rather than the default value
# to avoid a warning being raised by this example
qt = QuantileTransformer(n_quantiles=500, output_distribution='normal',
                         random_state=rng)
size = (N_SAMPLES, 1)

# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)

# gaussian distribution
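# A hedged follow-up (not in the original): apply the three transformers defined above to
# the lognormal sample and compare how close each output is to a Gaussian. The normaltest
# check is an assumption about how one might verify the result.
from scipy.stats import normaltest

X_bc = bc.fit_transform(X_lognormal)   # box-cox (requires strictly positive input)
X_yj = yj.fit_transform(X_lognormal)   # yeo-johnson (works for any sign)
X_qt = qt.fit_transform(X_lognormal)   # rank-based mapping onto a normal distribution

for name, arr in [('box-cox', X_bc), ('yeo-johnson', X_yj), ('quantile', X_qt)]:
    print(name, 'normaltest p-value:', normaltest(arr.ravel()).pvalue)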
def _make_experiment(category, name, learning_algorithm, learning_params,
                     X, y, outer_folds=7, inner_folds=5, logger=None):
    if category not in ('classification', 'regression'):
        raise ValueError("'category' should be either equal to "
                         "'classification' or 'regression' "
                         f"(found {category})")
    if logger:
        logger.info(f'starting experiment: {name}')

    pipeline_desc = [('scaler', None),
                     ('learning_algorithm', learning_algorithm)]
    pipe = Pipeline(pipeline_desc)
    scalers = [
        StandardScaler(),
        RobustScaler(),
        MinMaxScaler(),
        QuantileTransformer(n_quantiles=50)
    ]
    params = {'scaler': scalers}
    for k in learning_params:
        params['learning_algorithm__' + k] = learning_params[k]

    fold_gen = StratifiedKFold if category == 'classification' else KFold
    outer_fold = fold_gen(n_splits=outer_folds)
    scores = []
    best_models = []
    best_params = []

    progress = tqdm(outer_fold.split(X, y),
                    total=outer_fold.get_n_splits(),
                    desc=name,
                    leave=False)
    for train_idx, test_idx in progress:
        if logger:
            logger.info(f'Outer fold {progress.n}')
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gs = RandomizedSearchCV(pipe, params, verbose=0, cv=inner_folds,
                                error_score=np.nan, n_jobs=-1, pre_dispatch=10)
        gs = gs.fit(X_train, y_train)
        predictions = gs.predict(X_test)
        perf = _f1_score if category == 'classification' else _rmse
        score = perf(y_test, predictions)
        scores.append(score)
        best_models.append(gs.best_estimator_)
        best_params.append(gs.best_params_)
        if logger:
            logger.info(f'score {score}')
            logger.info(f'best params {gs.best_params_}')

    if logger:
        logger.info('ended experiment.')
        logger.info(f'mean test error {np.mean(scores)}')
    progress.close()
    print(f'{name}: {np.mean(scores):.3f}')
    return scores, best_models, best_params
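# A hedged usage sketch for _make_experiment: the estimator, parameter grid and the
# make_classification toy data below are assumptions for illustration only; the helper
# supplies _f1_score/_rmse and the scaler candidates internally, as shown above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=300, n_features=10, random_state=0)
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 3, 5]}
scores, best_models, best_params = _make_experiment(
    'classification', 'rf-demo', RandomForestClassifier(random_state=0),
    rf_params, X_demo, y_demo, outer_folds=3, inner_folds=3)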
print(len(tfv.vocabulary_))
# df = pd.DataFrame(data=X.toarray())
# print(df)
sys.exit()

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, PowerTransformer

cols = ["Age", "Fee", "PhotoAmt", "VideoAmt", "Quantity"]
norm = np.random.normal(0, 0.1, 1000)

from scipy.stats import skewtest, normaltest
print(normaltest(norm))

rng = np.random.RandomState(304)
qt = QuantileTransformer(output_distribution='normal', random_state=rng)
pt = PowerTransformer(method="yeo-johnson")

for c in cols:
    f, axes = plt.subplots(2, 2)
    axes[0, 0].hist(train[c], bins='auto')
    axes[0, 0].set_title(c + " notransform:" + str(normaltest(train[c])[1]))

    qt_t = qt.fit_transform(train[c].values.reshape(-1, 1))
    axes[0, 1].hist(qt_t, bins='auto', label=str(normaltest(qt_t)[1]))
    axes[0, 1].set_title("quantiletransform:" + str(normaltest(qt_t)[1]))

    pt_t = pt.fit_transform(train[c].values.reshape(-1, 1))
    axes[1, 0].hist(pt_t, bins='auto', label=str(normaltest(pt_t)[1]))
    axes[1, 0].set_title("powertransform:" + str(normaltest(pt_t)[1]))
def uniform_scaler(train, test, seed=123):
    scaler = QuantileTransformer(n_quantiles=100,
                                 output_distribution='uniform',
                                 random_state=seed,
                                 copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index([train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
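# A hedged usage sketch for uniform_scaler: the DataFrame and the split below are made-up
# illustration data. The function returns the fitted scaler (useful later for
# inverse_transform) plus scaled train/test frames with their original indices preserved.
import numpy as np
import pandas as pd

demo = pd.DataFrame({'a': np.random.lognormal(size=200),
                     'b': np.random.normal(size=200)})
train_part, test_part = demo.iloc[:150], demo.iloc[150:]
scaler, train_u, test_u = uniform_scaler(train_part, test_part, seed=123)
print(train_u.describe().loc[['min', 'max']])   # training values now lie in [0, 1]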
signal_desc_PowerTransformer = []
for i in range(8):
    signal_desc_PowerTransformer += [pd.Series(signal_PowerTransformer[i, :])]

corr_silver_PowerTransformer = pd.DataFrame(signal_PowerTransformer)
corr_silver_PowerTransformer = corr_silver_PowerTransformer.transpose()
desc_corr_silver_PowerTransformer = corr_silver_PowerTransformer.describe()
corr_mat_PowerTransformer = corr_silver_PowerTransformer.corr()
cov_mat_PowerTransformer = corr_silver_PowerTransformer.cov()

######################################################################################################################

signal_QuantileTransformerUniform = QuantileTransformer(
    output_distribution='uniform').fit_transform(signal)

signal_desc_QuantileTransformerUniform = []
for i in range(8):
    signal_desc_QuantileTransformerUniform += [
        pd.Series(signal_QuantileTransformerUniform[i, :])
    ]

corr_silver_QuantileTransformerUniform = pd.DataFrame(signal_QuantileTransformerUniform)
corr_silver_QuantileTransformerUniform = corr_silver_QuantileTransformerUniform.transpose()
desc_corr_silver_QuantileTransformerUniform = corr_silver_QuantileTransformerUniform.describe()
def input_data_clustering(device: str,
                          start_date: date,
                          end_date: Optional[date] = None,
                          n_clusters=5,
                          return_only_cluster=True,
                          return_pca=False) -> pd.DataFrame:

    def add_column_postfix(df: pd.DataFrame, postfix: str) -> pd.DataFrame:
        columns = df.columns
        mapping = {c: f"{c}_{postfix}" for c in columns}
        return df.rename(columns=mapping)

    # get normalized input data
    data = get_input_data(device, start_date, end_date=end_date, normalized=True)
    if data.empty:
        return data

    # compute statistics over a rolling window
    rolling = data.rolling('15Min', min_periods=1, win_type=None)
    data_rolling_ = list()
    data_rolling_.append(add_column_postfix(rolling.count(), "count"))
    data_rolling_.append(add_column_postfix(rolling.sum(), "sum"))
    data_rolling_.append(add_column_postfix(rolling.mean(), "mean"))
    data_rolling_.append(add_column_postfix(rolling.median(), "median"))
    data_rolling_.append(add_column_postfix(rolling.var(), "var"))
    data_rolling_.append(add_column_postfix(rolling.kurt(), "kurt"))
    data_rolling_.append(add_column_postfix(rolling.skew(), "skew"))
    data_rolling = pd.concat(data_rolling_, axis=1)
    data_rolling = data_rolling.loc[~data_rolling.index.duplicated(keep='first')]
    data_rolling = data_rolling.resample("1Min").nearest(limit=1).dropna(how='all')

    from analytics.instruction import get_power
    power_data = get_power(device, start_date)
    power_data_rolling = power_data.rolling('15Min', min_periods=1, win_type=None).mean()
    data_rolling = data_rolling.merge(power_data_rolling, how='left',
                                      left_index=True, right_index=True)
    data_rolling = data_rolling[data_rolling.power >= 0.95]
    data_rolling = data_rolling.drop(columns='power')

    # normalize rolling data
    st_rolling = QuantileTransformer(output_distribution="normal")
    st_rolling.fit(data_rolling)
    data_rolling_normalized = pd.DataFrame(st_rolling.transform(data_rolling),
                                           columns=data_rolling.columns,
                                           index=data_rolling.index).fillna(0)

    # we do not have enough data for a clustering
    if len(data_rolling_normalized) < n_clusters:
        return pd.DataFrame()

    # perform PCA
    pca = PCA(random_state=31415)
    pca.fit(data_rolling_normalized)
    variance = np.cumsum(pca.explained_variance_ratio_)
    # how many dimensions to keep for explained variance over 0.95
    n_dims = variance[variance <= 0.95].shape[0] + 1
    data_pca = pca.transform(data_rolling_normalized)[:, :n_dims]

    # cluster the data into n_clusters clusters
    k_means = KMeans(n_clusters=n_clusters, random_state=31415)
    clustering = k_means.fit_predict(data_pca)

    if return_pca:
        cluster_df = pd.DataFrame(clustering, columns=['cluster'])
        pca_df = pd.DataFrame(data_pca)
        pca_df.columns = [f"d_{c}" for c in pca_df.columns]
        return pd.concat([cluster_df, pca_df], axis=1)

    data_rolling.loc[:, 'cluster'] = clustering
    if return_only_cluster:
        return data_rolling[['cluster']]
    return data_rolling
def perform_uniform_scaler(train, test):
    u_scaler = QuantileTransformer(n_quantiles=100,
                                   output_distribution='uniform',
                                   random_state=123,
                                   copy=True).fit(train)
    u_train_scaled = pd.DataFrame(u_scaler.transform(train),
                                  columns=train.columns.values).set_index([train.index.values])
    u_test_scaled = pd.DataFrame(u_scaler.transform(test),
                                 columns=test.columns.values).set_index([test.index.values])
    return u_scaler, u_train_scaled, u_test_scaled
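# A hedged follow-up (not in the original): because the fitted transformer is returned,
# scaled frames or predictions can be mapped back to the original units later.
# u_scaler and u_train_scaled are assumed to come from a prior perform_uniform_scaler call.
restored = pd.DataFrame(u_scaler.inverse_transform(u_train_scaled),
                        columns=u_train_scaled.columns,
                        index=u_train_scaled.index)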
from app.ml.objects.normalization import Normalization
from sklearn.preprocessing import (MinMaxScaler, Normalizer, QuantileTransformer,
                                   RobustScaler, StandardScaler)

normalizer_factory_dict = {
    Normalization.MIN_MAX_SCALER: lambda: MinMaxScaler(),
    Normalization.NORMALIZER: lambda: Normalizer(),
    Normalization.QUANTILE_TRANSFORMER: lambda: QuantileTransformer(),
    Normalization.ROBUST_SCALER: lambda: RobustScaler(),
    Normalization.STANDARD_SCALER: lambda: StandardScaler()
}


def get_normalizer(normalization: Normalization):
    return normalizer_factory_dict[normalization]()
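# A hedged usage sketch: each factory entry returns a fresh, unfitted scikit-learn
# transformer, so repeated calls never share fitted state. The sample array is made up.
import numpy as np

normalizer = get_normalizer(Normalization.QUANTILE_TRANSFORMER)
sample = np.random.rand(50, 3)
scaled = normalizer.fit_transform(sample)
assert get_normalizer(Normalization.QUANTILE_TRANSFORMER) is not normalizer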
def train_model(params, seed, model_num): if model_num == 0: num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 #3000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, y[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, y[val_idx]), shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = model.predict(X_val)[:, 0] print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, y, batch_size=batchsize, epochs=epochs, verbose=0, shuffle=True, callbacks=[lr_schedule]) y_test_pred = model.predict(X_test)[:, 0] K.clear_session() return y_test_pred if model_num == 1: num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, ynorm[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, ynorm[val_idx]), shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = model.predict(X_val)[:, 0] * std + mean print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, ynorm, batch_size=batchsize, epochs=epochs, verbose=1, shuffle=True, callbacks=[lr_schedule]) y_test_pred = model.predict(X_test)[:, 0] * std + mean K.clear_session() return y_test_pred if model_num == 2: normll = QuantileTransformer(output_distribution='normal') ynorm2 = normll.fit_transform(yrel) num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, ynorm2[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, ynorm2[val_idx]), 
shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = (normll.inverse_transform(model.predict(X_val))[:, 0] + 1) * train_data['cat_price'].values[val_idx] print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, ynorm2, batch_size=batchsize, epochs=epochs, verbose=0, shuffle=True, callbacks=[lr_schedule]) y_test_pred = ( normll.inverse_transform(model.predict(X_test))[:, 0] + 1) * test_data['cat_price'].values K.clear_session() return y_test_pred if model_num == 3: num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, y[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, y[val_idx]), shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = model.predict(X_val)[:, 0] print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, y, batch_size=batchsize, epochs=epochs, verbose=0, shuffle=True, callbacks=[lr_schedule]) y_test_pred = model.predict(X_test)[:, 0] K.clear_session() return y_test_pred
# first plot
ax0.scatter(y_test, y_pred)
ax0.set_xlabel('True Target')
ax0.set_ylabel('Target predicted')
ax0.plot([0, 10], [0, 10], '--k')
ax0.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (r2_score(y_test, y_pred),
                                           median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 10])
ax0.set_ylim([0, 10])

# TransformedTargetRegressor transforms the target y before fitting the regression model,
# and maps the model's predictions back to the original space via the inverse transform.
# It takes two arguments: the regressor used for prediction and the transformer applied to the target.
regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        transformer=QuantileTransformer(
                                            n_quantiles=300,
                                            output_distribution='normal'))
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)

# second plot
ax1.scatter(y_test, y_pred)
ax1.plot([0, 10], [0, 10], '--k')
ax1.set_xlabel('True Target')
ax1.set_ylabel('Target predicted')
ax1.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (r2_score(y_test, y_pred),
                                          median_absolute_error(y_test, y_pred)))
ax1.set_xlim([0, 10])
ax1.set_ylim([0, 10])
train_filename = 'NSLKDD/KDDTrain.csv'
test_filename = 'NSLKDD/KDDTest.csv'
model_dir = 'WideDeepModel/NSLKDD/'
train_path = model_dir + 'aug_train.csv'
test_path = model_dir + 'aug_test.csv'

fold = 5
num_epochs = 240
batch_size = 64
dropout = 0.2

label_mapping = {'normal': 0, 'probe': 1, 'dos': 2, 'u2r': 3, 'r2l': 4}
class_weights = {
    'normal': 0.15,
    'probe': 0.2,
    'dos': 0.15,
    'u2r': 0.3,
    'r2l': 0.2
}

transformer = QuantileTransformer()
transformer_fitted = False
scaler = MinMaxScaler()
scaler_fitted = False

columns = process_dataset(train_filename, train_path, split=True)
process_dataset(test_filename, test_path, split=False)
hist = train_and_eval(model_dir, columns, train_path, test_path)
plot_history(hist['train_loss'], hist['valid_loss'], hist['test_loss'], model_dir)

output = open(model_dir + 'Runs%d.pkl' % (num_epochs), 'wb')
pickle.dump(hist, output)
output.close()
def deal_with_scaling(data, model, y_name): """ Fits a scaler and transform data. :param data: pandas DataFrame :param model: regression model to be used :param scaler: scaler for numerical data to be used :param y_name: name of your target variable :return: transformed data. """ data = data.copy() if sum(data.isna().sum()) > 0: print('Unable to check best scaler for data. You have NaNs in there!') return None, None, None, None scalers = { 'row-wise': [ PowerTransformer(method='yeo-johnson'), PowerTransformer(method='box-cox'), StandardScaler(), MinMaxScaler(), RobustScaler(), FunctionTransformer(np.log1p, validate=True) ], 'col-wise': [QuantileTransformer(output_distribution='normal'), Normalizer()] } max_score = regression_benchmark(data, model, y_name) final_scaler = None final_model = model X = pd.get_dummies(data.drop(y_name, axis=1)) y = data[y_name] X_train_final, X_test_final, y_train_final, y_test_final = train_test_split( X, y, test_size=0.25, random_state=42) print('Testing different scalers. This might take a while.') for scaler in scalers['row-wise']: X = pd.get_dummies(data.drop(y_name, axis=1)) y = data[y_name] X_train, X_test, y_train_, y_test_ = train_test_split(X, y, test_size=0.25, random_state=42) try: scaler.fit(X_train) X_train_, X_test_ = (scaler.transform(X_train), scaler.transform(X_test)) model.fit(X_train_, y_train_) score = model.score(X_test_, y_test_) except: print(f'An error ocurred while scaling with {scaler}.') continue if score > max_score: max_score = score final_scaler = scaler final_model = model X_train_final, X_test_final, y_train_final, y_test_final = X_train_, X_test_, y_train_, y_test_ print('Almost there...') for scaler in scalers['col-wise']: X = data.drop(y_name, axis=1) y = data[y_name] try: X_num = scaler.fit_transform(X.select_dtypes(np.number)) X_cat = pd.get_dummies(X.select_dtypes(exclude=np.number)) X_ = np.concatenate((X_num, X_cat), axis=1) X_train_, X_test_, y_train_, y_test_ = train_test_split( X_, y, test_size=0.25, random_state=42) model.fit(X_train_, y_train_) score = model.score(X_test_, y_test_) except: print(f'An error ocurred while scaling with {scaler}.') continue if score > max_score: max_score = score final_scaler = scaler final_model = model X_train_final, X_test_final, y_train_final, y_test_final = X_train_, X_test_, y_train_, y_test_ with open('final_scaler.pkl', 'wb') as file: pickle.dump(scaler, file) #if max_score == regression_benchmark(data, model, y_name): # final_model = # max_score = print( f'The scaler chosen was {scaler}, with an r-squared of {max_score}.\nSaving scaler to "final_scaler.pkl".\n' ) return X_train_final, X_test_final, y_train_final, y_test_final, final_model, max_score, final_scaler
def _get_valid_samples_by_column(X, col):
    """Get non-NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    # check that the preprocessing method lets NaN pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
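# A hedged standalone illustration of the behaviour the test above checks: scikit-learn
# preprocessing transformers fit on the non-missing values of each column and propagate
# NaN entries through transform unchanged. The toy array below is an assumption.
import numpy as np
from sklearn.preprocessing import QuantileTransformer

X_demo = np.array([[1.0], [2.0], [np.nan], [4.0], [5.0]])
qt_demo = QuantileTransformer(n_quantiles=4, output_distribution='uniform')
X_out = qt_demo.fit_transform(X_demo)
print(X_out.ravel())            # the third entry stays NaN, the rest map into [0, 1]
assert np.isnan(X_out[2, 0])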
def get_scaler(n_quantiles):
    if n_quantiles > 0:
        return QuantileTransformer(n_quantiles=n_quantiles,
                                   output_distribution='normal',
                                   subsample=int(1e10))
    else:
        return StandardScaler()
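# A hedged usage sketch: a positive n_quantiles yields a rank-gauss style QuantileTransformer
# (the very large subsample value effectively disables subsampling), while 0 or a negative
# value falls back to plain standardisation.
scaler = get_scaler(1000)        # quantile-based, maps features toward a normal distribution
fallback = get_scaler(0)         # StandardScaler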
def get_new_base_enc():
    return QuantileTransformer()
"Parameter1", "Parameter2", "Accuracy", "Balanced Accuracy", "MSE", "r2", "spearmanr" ]) for split in np.arange(numsplits): print("Evaluating fold " + str(split)) train_index = kfolds["fold_" + str(split)]["train"] test_index = kfolds["fold_" + str(split)]["test"] X_train, X_test = features_nosurv.iloc[train_index], features_nosurv.iloc[ test_index] y_train, y_test = surv_days[train_index], surv_days[test_index] # scale target with a quantile transform qtfm = QuantileTransformer(output_distribution='uniform', n_quantiles=150, random_state=randomstate) y_train = np.squeeze(qtfm.fit_transform(y_train.values.reshape(-1, 1))) y_test = np.squeeze(qtfm.transform(y_test.values.reshape(-1, 1))) # y_train, y_test = surv_classes[train_index], surv_classes[test_index] # for every split, perform feature selection for sel_name, sel in zip(selectornames_short, selectors): print('#####') print(sel_name) print('#####') if sel_name is "CHSQ": # shift X values to be non-negative for chsq feature selection X_train_tmp = X_train + np.abs(X_train.min()) selscore = sel(X_train_tmp, y_train)
def prepare_data(control_fmri_data, control_phenotype_data, SCHZ_fmri_data, SCHZ_phenotype_data, \ ADHD_fmri_data, ADHD_phenotype_data, BIPL_fmri_data, BIPL_phenotype_data, train_num, factor=5, sampling='bootstrap'): CTRL_num = control_phenotype_data.shape[0] SCHZ_num = SCHZ_phenotype_data.shape[0] ADHD_num = ADHD_phenotype_data.shape[0] BIPL_num = BIPL_phenotype_data.shape[0] x_context = torch.zeros([train_num+15, factor, control_phenotype_data.shape[1]]) y_context = torch.zeros([train_num+15, factor, control_fmri_data.shape[1], control_fmri_data.shape[2], control_fmri_data.shape[3]]) x_all = torch.zeros([train_num+15, factor, control_phenotype_data.shape[1]]) y_all = torch.zeros([train_num+15, factor, control_fmri_data.shape[1], control_fmri_data.shape[2], control_fmri_data.shape[3]]) rand_idx = np.random.permutation(CTRL_num) train_idx_ctrl = rand_idx[0:train_num] test_idx_ctrl = np.setdiff1d(np.array(range(CTRL_num)),train_idx_ctrl) rand_idx = np.random.permutation(SCHZ_num) train_idx_SCHZ = rand_idx[0:5] test_idx_SCHZ = np.setdiff1d(np.array(range(SCHZ_num)),train_idx_SCHZ) rand_idx = np.random.permutation(ADHD_num) train_idx_ADHD = rand_idx[0:5] test_idx_ADHD = np.setdiff1d(np.array(range(ADHD_num)),train_idx_ADHD) rand_idx = np.random.permutation(BIPL_num) train_idx_BIPL = rand_idx[0:5] test_idx_BIPL = np.setdiff1d(np.array(range(BIPL_num)),train_idx_BIPL) x_context_train = torch.cat((control_phenotype_data[train_idx_ctrl,:], SCHZ_phenotype_data[train_idx_SCHZ,:], ADHD_phenotype_data[train_idx_ADHD,:], BIPL_phenotype_data[train_idx_BIPL,:])) means = x_context_train.mean(dim = 0, keepdim = True) stds = x_context_train.std(dim = 0, keepdim = True) x_context_train = (x_context_train - means) / stds x_context_train[x_context_train != x_context_train] = 0 x_context_train[x_context_train == float("-Inf")] = 0 x_context_train[x_context_train == float("Inf")] = 0 x_context_test = torch.cat((control_phenotype_data[test_idx_ctrl,:], SCHZ_phenotype_data[test_idx_SCHZ,:], ADHD_phenotype_data[test_idx_ADHD,:], BIPL_phenotype_data[test_idx_BIPL,:]),0) x_context_test = (x_context_test - means) / stds x_context_test[x_context_test != x_context_test] = 0 x_context_test[x_context_test == float("-Inf")] = 0 x_context_test[x_context_test == float("Inf")] = 0 x_test = x_context_test x_context_test = x_context_test.unsqueeze(1).expand(-1,factor,-1) y_context_train = torch.cat((control_fmri_data[train_idx_ctrl,:,:,:], SCHZ_fmri_data[train_idx_SCHZ,:,:,:], ADHD_fmri_data[train_idx_ADHD,:,:,:], BIPL_fmri_data[train_idx_BIPL,:,:,:]),0) y_test = torch.cat((control_fmri_data[test_idx_ctrl,:,:,:], SCHZ_fmri_data[test_idx_SCHZ,:,:,:], ADHD_fmri_data[test_idx_ADHD,:,:,:], BIPL_fmri_data[test_idx_BIPL,:,:,:]),0) y_context_test = torch.zeros([y_test.shape[0], factor, y_test.shape[1], y_test.shape[2], y_test.shape[3]]) scaler = QuantileTransformer() scaler.fit(ravel_2D(np.concatenate((control_fmri_data, SCHZ_fmri_data, ADHD_fmri_data, BIPL_fmri_data),0))) for i in range(factor): if sampling == 'noise': x_context[:,i,:] = x_context_train + torch.randn(x_context_train.shape) * 0.01 x_context_test[:,i,:] = x_context_test[:,i,:] + torch.randn([x_context_test.shape[0],x_context_test.shape[2]]) * 0.01 elif sampling == 'bootstrap': x_context[:,i,:] = x_context_train[:,:] idx = np.random.randint(0,x_context_train.shape[0], x_context_train.shape[0]) for j in range(y_context_train.shape[1]): for k in range(y_context_train.shape[2]): for l in range(y_context_train.shape[3]): reg = LinearRegression() if sampling == 'noise': 
reg.fit(x_context[:,i,:].numpy(),y_context_train[:,j,k,l].numpy()) elif sampling == 'bootstrap': reg.fit(x_context[idx,i,:].numpy(),y_context_train[idx,j,k,l].numpy()) y_context[:,i,j,k,l] = torch.tensor(reg.predict(x_context[:,i,:].numpy())) y_context_test[:,i,j,k,l] = torch.tensor(reg.predict(x_context_test[:,i,:].numpy())) y_context[:,i,:,:,:] = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context[:,i,:,:,:])),y_context[:,i,:,:,:].shape)) y_context_test[:,i,:,:,:] = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context_test[:,i,:,:,:])),y_context_test[:,i,:,:,:].shape)) print(i) x_all = x_context_train.unsqueeze(1).expand(-1,factor,-1) y_all = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context_train)),y_context_train.shape),dtype=torch.float32).unsqueeze(1).expand(-1,factor,-1,-1,-1) y_test = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_test)),y_test.shape),dtype=torch.float32) y_test = y_test.view((y_test.shape[0],1,y_test.shape[1],y_test.shape[2],y_test.shape[3])) labels = np.zeros(y_test.shape[0]) labels[len(test_idx_ctrl):] = 1 diagnosis_labels = np.zeros(y_test.shape[0]) diagnosis_labels[len(test_idx_ctrl):len(test_idx_ctrl)+len(test_idx_SCHZ)] = 1 diagnosis_labels[len(test_idx_ctrl)+len(test_idx_SCHZ):len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD)] = 2 diagnosis_labels[len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD):len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD)+len(test_idx_BIPL)] = 3 return x_context, y_context, x_all, y_all, x_context_test, y_context_test, x_test, y_test, labels, diagnosis_labels, scaler
def pandas_group_quantile_transform(x):
    """Used inside the transform function after a pandas groupby operation"""
    qt = QuantileTransformer()
    return qt.fit_transform(x.values.reshape(-1, 1)).reshape(-1)
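# Example usage (added for illustration): applying the per-group quantile transform with pandas'
# groupby().transform, which is the pattern the docstring above refers to. Data is synthetic.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'store': np.repeat(['a', 'b'], 1000),
    'sales': np.random.RandomState(0).exponential(size=2000),
})
# each store's sales distribution is quantile-transformed independently
df['sales_qt'] = df.groupby('store')['sales'].transform(pandas_group_quantile_transform)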
class PowerForecaster: """ Check out the class spec at https://docs.google.com/document/d/1-ceuHfJ2bNbgmKddLTUCS0HJ1juE5t0042Mts_yEUD8v sample data is in https://drive.google.com/uc?export=download&id=1z2MBYJ8k4M5J3udlFVc2d8opE_f-S4BK """ def __init__(self, df, model=Models.PROPHET, upsample_freq=None, train_test_split_ratio=Constants.TRAIN_TEST_SPLIT_RATIO.value, epochs=Constants.EPOCHS.value, initial_epoch=Constants.INITIAL_EPOCH.value, batch_size=Constants.BATCH_SIZE.value, sliding_window_size_or_time_steps=Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value, do_shuffle=True): logging.info("resample: {}. future_prediction: {}, epochs: {}, batch_size: {}," " window_size: {}, eurons: {}" .format(Constants.RESAMPLING_FREQ.value , Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value , epochs , batch_size , sliding_window_size_or_time_steps , Constants.NEURONS.value )) if logging.getLogger().isEnabledFor(logging.INFO): explore_data(df) # first step is to create a timestamp column as index to turn it to a TimeSeries data df.index = pd.to_datetime(df[ColumnNames.DATE.value] + df[ColumnNames.TIME.value], format='%Y-%m-%d%H:%M:%S', errors='raise') if 'Unnamed: 0' in df.columns: df.drop('Unnamed: 0', axis=1, inplace=True) # keep a copy of original dataset for future comparison self.df_original = df.copy() # we interpolate temperature using prophet to use it in a multivariate forecast temperature = ColumnNames.TEMPERATURE.value interpolated_df = facebook_prophet_filter(df, temperature, Constants.FORECASTED_TEMPERATURE_FILE.value) interpolated_df.index = df.index df[[temperature]] = interpolated_df[[ColumnNames.FORECAST.value]] # lets also interpolate missing kwh using facebook prophet (or we could simply drop them) # now turn to kwh and make the format compatible with prophet power = ColumnNames.POWER.value interpolated_df = facebook_prophet_filter(df, power, Constants.FORECASTED_POWER_FILE.value) interpolated_df.index = df.index df[[power]] = interpolated_df[[ColumnNames.FORECAST.value]] df = df.rename(columns={power: ColumnNames.LABEL.value}) df.drop(columns=[ColumnNames.DATE.value, ColumnNames.TIME.value, ColumnNames.DAY_OF_WEEK.value, ColumnNames.MONTH.value], inplace=True ) if upsample_freq is not None: df = df.resample(upsample_freq).mean() # for any regression or forecasting it is better to work with normalized data self.transformer = QuantileTransformer() # handle outliers better than MinMaxScalar features = ColumnNames.FEATURES.value normalized = normalize(df, features, transformer=self.transformer) # we use the last part (after 12/1/2013) that doesnt have temperature for testing cutoff_date = Constants.CUTOFF_DATE.value self.df = normalized[normalized.index < cutoff_date] self.testing = normalized[normalized.index >= cutoff_date] self.df[ColumnNames.DATE_STAMP.value] = self.df.index self.df_blocked = None self.train_test_split_ratio = train_test_split_ratio self.model_type = model self.train_X, self.test_X, self.train_test_split_index = self.train_test_split(self.df[features]) self.train_y, self.test_y, _ = self.train_test_split(self.df[ColumnNames.LABELS.value]) self.model_fit = None self.epochs = epochs self.initial_epoch = initial_epoch self.batch_size = batch_size self.history = None # following is defines in sliding_window self.do_shuffle = do_shuffle self.val_idx = None self.shuffled_X = None self.shuffled_y = None self.train = None self.label = None self.train_size = None self.val_size = None if logging.getLogger().isEnabledFor(logging.INFO): explore_data(self.df) def 
train_test_split(self, df): split_index = int(self.train_test_split_ratio * df.shape[0]) train = df.iloc[:split_index, :] test = df.iloc[split_index:, :] return train, test, split_index def stationary_test(self): dataset = self.test_y.dropna() seasonal_dataset = sm.tsa.seasonal_decompose(dataset, freq=365) fig = seasonal_dataset.plot() fig.set_figheight(8) fig.set_figwidth(15) fig.show() def p_value(dataset): # ADF-test(Original-time-series) dataset.dropna() p_value = sm.tsa.adfuller(dataset, regression='ct') logging.debug('p-value:{}'.format(p_value)) p_value = sm.tsa.adfuller(dataset, regression='c') logging.debug('p-value:{}'.format(p_value)) p_value(self.train_y) p_value(self.test_y) # Test works for only 12 variables, check the eigenvalues johnsen_test = coint_johansen(self.df[ColumnNames.FEATURES.value].dropna(), -1, 1).eig return johnsen_test def seasonal_prediction(self): from statsmodels.tsa.api import SimpleExpSmoothing y_hat_avg = self.test_y.copy() fit2 = SimpleExpSmoothing(np.asarray(self.train_y['Count'])).fit(smoothing_level=0.6, optimized=False) y_hat_avg['SES'] = fit2.forecast(len(self.test_y)) plt.figure(figsize=(16, 8)) plt.plot(self.train_y['Count'], label='Train') plt.plot(self.test_y['Count'], label='Test') plt.plot(y_hat_avg['SES'], label='SES') plt.legend(loc='best') plt.show() def fit(self): if self.model_type == Models.PROPHET: self.prophet_fit() elif self.model_type == Models.ARIMA: self.arima_fit() elif self.model_type == Models.VAR: self.var_fit() elif self.model_type == Models.LSTM: self.lstm_fit() else: raise ValueError("{} is not defined".format(self.model_type)) def evaluate(self): self.loss_metrics = self.model_type.value.evaluate( self.val_X, self.val_y, batch_size=self.batch_size, verbose=0 ) logging.info("Metric names:{}".format(self.model_type.value.metrics_names)) logging.info("Loss Metrics:{}".format(self.loss_metrics)) def resultToDataFrame(self, data, start_index, end_index, do_scale_back=False): label_column = ColumnNames.LABEL.value df = self.df.iloc[start_index:end_index] df[label_column] = data if do_scale_back: features = ColumnNames.FEATURES.value df[features] = self.transformer.inverse_transform(df[features]) return df[[label_column]] def block_after_date(self, start_block_date_st): index, _ = find_index(self.df, start_block_date_st) logging.debug("Index of block is {} with length of {}".format(index, len(self.df) - index)) self.df_blocked = self.df.iloc[index:] self.df_blocked.reindex() logging.info("Blocked from {} to {} fromo training and validation" .format(self.df_blocked.index[0], self.df_blocked.index[-1])) def adjust_index_and_training_shift(self, start_date_in_labeling_st , training_duration_in_frequency = None , start_date_training_st = None ): logging.debug("Original range data of data: [{}-{}]".format(self.df.index[0], self.df.index[-1])) index_start_labeling, _ = find_index(self.df, start_date_in_labeling_st) if start_date_training_st is not None: index_start_training, _ = find_index(self.df, start_date_training_st) if index_start_labeling < index_start_training: raise ValueError("Labeling should be after training") self.shift = index_start_labeling - index_start_training else: index_start_training = 0 self.shift = index_start_labeling if training_duration_in_frequency is None: logging.info("Shift is set to be {}".format(self.shift)) else: final_index = index_start_training + training_duration_in_frequency + self.shift logging.debug("start index: {}, final_index: {}".format(index_start_training, final_index)) self.df = 
self.df.iloc[index_start_training:index_start_training + training_duration_in_frequency + self.shift] logging.info("Shift is set to be {}, we picked the slice of [{} : {}] for trainig".format( self.shift, self.df.index[0] , self.df.index[-1] )) def lstm_predict(self, model , start_date_to_predict_st=None , duration_in_freq = None , do_scale_back = False ): X, true_y = self.get_whole() if start_date_to_predict_st is not None: y_index_i, _ = find_index(self.df, start_date_to_predict_st) x_index_i = 0 if y_index_i <= self.shift else y_index_i - self.shift x_index_f = x_index_i + duration_in_freq y_index_f = y_index_i + duration_in_freq logging.info("Predicting time slice [{} : {}] from [{} : {}]".format( self.df.index[y_index_i],self.df.index[y_index_f] , self.df.index[x_index_i], self.df.index[x_index_f] )) X = X[x_index_i:x_index_f] true_y = true_y[y_index_i:y_index_f] predicted = model.predict(X) logging.debug("Predicted Labels shape: {}".format(predicted.shape)) plt.plot(predicted, 'r') plt.plot(true_y, 'b') plt.show() df_predicted = self.resultToDataFrame(predicted, x_index_i + self.shift , x_index_f + self.shift, do_scale_back) return df_predicted def scale_back(self, df_predicted, start_index, end_index): label_column = ColumnNames.LABEL.value features = ColumnNames.FEATURES.value df = self.df[features].iloc[start_index:end_index] df[label_column] = df_predicted[label_column] scaled_predicted = self.transformer.inverse_transform(df[features]) df[features] = scaled_predicted return df def prophet_fit(self): past = self.train_y.copy() past[ColumnNames.DATE_STAMP.value] = self.train_y.index self.model_type.value.fit(past) def arima_fit(self): model = sm.tsa.statespace.SARIMAX(self.train_y, order=Constants.SARIMAX_ORDER.value, seasonal_order=Constants.SARIMAX_SEASONAL_ORDER.value) # ,enforce_stationarity=False, enforce_invertibility=False, freq='15T') logging.debug("SARIMAX fitting ....") self.model_fit = self.model_type.value.fit() self.model_fit.summary() logging.debug("SARIMAX forecast", self.model_fit.forecast()) def var_fit(self): logging.debug("making VAR model") model = VAR(endog=self.train_X[ColumnNames.FEATURES.value].dropna()) logging.debug("VAR fitting ....") self.model_fit = model.fit() print(self.model_fit.summary()) def lstm_fit(self): if logging.getLogger().isEnabledFor(logging.INFO): print(self.model_type.value.summary()) callbacks = Callbacks(Constants.MODEL_NAME.value, self.batch_size, self.epochs) X, y = self.get_shuff_train_label() self.history = self.model_type.value.fit( X, y, epochs=self.epochs, batch_size=self.batch_size, validation_split=0.35, verbose=0, callbacks=callbacks.getDefaultCallbacks(), initial_epoch=self.initial_epoch, ) logging.debug("history of performance:{}".format(self.history.history)) def predict(self, feature_set=None): future = feature_set if feature_set is not None \ else Constants.DEFAULT_FUTURE_PERIODS.value if self.model_type == Models.PROPHET: self.future = self.model_type.value.make_future_dataframe(periods=future, freq=Constants.DEFAULT_FUTURE_FREQ.value, include_history=False) if self.model_type == Models.PROPHET: predicted = self.model_type.value.predict(self.future) predicted[ColumnNames.LABEL.value] = predicted[ColumnNames.FORECAST.value] elif self.model_type == Models.ARIMA: predicted = self.arima_predict(future) elif self.model_type == Models.VAR: predicted = self.var_predict(future) elif self.model_type == Models.LSTM: return self.lstm_predict(self.model.value, start_date_to_predict_st="2013-6-01", duration_in_freq=3 * 30) else: 
raise ValueError("{} is not defined".format(self.model_type)) df_predicted = self.resultToDataFrame(predicted, self.train_test_split_index , self.train_test_split_index + len(predicted)) return df_predicted def arima_predict(self, future): end = str(self.train_y.index[-1]) start = str(self.train_y.index[-future]) print(start, end) predicted = self.model_fit.predict(start=start[:10], end=end[:10], dynamic=True) return predicted def var_predict(self, future): predicted_array = self.model_fit.forecast(self.model_fit.y, future) predicted = pd.DataFrame(predicted_array) predicted.columns = ColumnNames.FEATURES.value predicted.index = self.test_y.index[:len(predicted)] return predicted def sliding_window(self): # Generate the data matrix length0 = self.df.shape[0] window_size = Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value future_time_steps = Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value features_column = ColumnNames.FEATURES.value label_column = ColumnNames.LABEL.value sliding_window_feature = np.zeros((length0 - window_size - future_time_steps, window_size, len(features_column))) sliding_window_label = np.zeros((length0 - window_size - future_time_steps, 1)) for counter in range(length0 - window_size - future_time_steps): sliding_window_label[counter, :] = self.df[label_column][counter + window_size + future_time_steps] for counter in range(length0 - window_size - future_time_steps): sliding_window_feature[counter, :] = self.df[features_column][ counter: counter + window_size] if self.do_shuffle: logging.debug('Random shuffeling') length = sliding_window_feature.shape[0] if self.df_blocked is not None: length -= len(self.df_blocked) logging.info("length of data reduced by {} due to blocking. The last date is {}" .format(len(self.df_blocked), self.df.index[length])) logging.debug("sliding window length: {}".format(length)) split_ratio = Constants.TRAIN_TEST_SPLIT_RATIO.value idx = np.random.choice(length, length, replace=False) if self.do_shuffle else np.arange(length) self.val_idx = idx[int(split_ratio * length):] feature_window_shuffled = sliding_window_feature[idx, :] label_window_shuffled = sliding_window_label[idx, :] self.shuffled_X = feature_window_shuffled self.shuffled_y = label_window_shuffled self.train = sliding_window_feature self.label = sliding_window_label self.train_X = self.shuffled_X[:int(split_ratio * length), :] self.train_y = self.shuffled_y[:int(split_ratio * length), :] self.train_size = int(split_ratio * length) self.val_X = self.shuffled_X[int(split_ratio * length):, :] self.val_y = self.shuffled_y[int(split_ratio * length):, :] self.val_size = length - self.train_size def get_shuff_train_label(self): X = self.shuffled_X # np.expand_dims(self.shuffled_X, axis=-1) Y = self.shuffled_y return X, Y def evaluate_performance(self): # make a prediction X = self.test_X # np.expand_dims(self.test_X, axis=-1) yhat = self.model_type.value.predict(X) test_X = self.test_X.reshape((self.test_X.shape[0], self.test_X.shape[2])) # invert scaling for forecast inv_yhat = pd.concatenate((yhat, test_X[:, 1:]), axis=1) inv_yhat = self.transformer.inverse_transform(inv_yhat) inv_yhat = inv_yhat[:, 0] # invert scaling for actual test_y = self.test_y.reshape((len(self.test_y), 1)) inv_y = pd.concatenate((test_y, test_X[:, 1:]), axis=1) inv_y = self.transformer.inverse_transform(inv_y) inv_y = inv_y[:, 0] # calculate RMSE rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) logging.debug('Test RMSE: %.3f' % rmse) def plot_future(self, predicted): self.model_type.value.plot(predicted, 
xlabel='Date', ylabel='KWH') self.model_type.value.plot_components(predicted) # by_dow.plot(xticks=ticks, style=style, title='Averaged on Days of the Week') # plt.show() def visual_inspection(self): style = [':', '--', '-'] pd.plotting.register_matplotlib_converters() df = self.df self.df_original[ColumnNames.ORIGINAL_FEATURES.value].plot(style=style, title='Original Data') plt.show() self.df[ColumnNames.FEATURES.value].plot(style=style, title='Normalized Data') plt.show() sampled = df.resample('M').sum()[ColumnNames.FEATURES.value] sampled.plot(style=style, title='Aggregated Monthly') plt.show() sampled = df.resample('W').sum()[ColumnNames.FEATURES.value] sampled.plot(style=style, title='Aggregated Weekly') plt.show() sampled = df.resample('D').sum()[ColumnNames.FEATURES.value] sampled.rolling(30, center=True).sum().plot(style=style, title='Aggregated Daily') plt.show() by_time = df.groupby(by=df.index.time).mean()[ColumnNames.FEATURES.value] ticks = 4 * 60 * 60 * np.arange(6) by_time.plot(xticks=ticks, style=style, title='Averaged Hourly') plt.show() days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] def tick(x): if x % 24 == 12: return days[int(x) // 24] else: return "" # ax.xaxis.set_major_formatter(NullFormatter()) # ax.xaxis.set_minor_formatter(FuncFormatter(tick)) # ax.tick_params(which="major", axis="x", length=10, width=1.5) #by_dow = df.groupby(by=df.dow).mean()[ColumnNames.FEATURES.value] #ticks = 4 * 60 * 60 * np.arange(6) def plot_prediction(self, start_index, end_index): style = [':', '--', '-'] pd.plotting.register_matplotlib_converters() label_column = ColumnNames.LABELS.value # import pdb; pdb.set_trace() t = self.train.index.iloc[start_index:end_index] X = self.train.iloc[start_index: end_index] true_y = self.label.iloc[start_index, end_index] y = self.model_type.value.predict(X) plt.plot(t, y, true_y, style=style) plt.show() def plot_history(self): plt.plot(np.arange(self.epochs - self.initial_epoch), self.history.history['loss'], label='train') plt.plot(np.arange(self.epochs - self.initial_epoch), self.history.history['val_loss'], label='validation') plt.legend() plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') plt.show() def get_next_train_batch(self): # getting the next train batch if self.pointer + self.batchsize >= self.train_size: end = self.train_size start = self.pointer self.pointer = 0 self.epoch += 1 else: end = self.pointer + self.batchsize start = self.pointer self.pointer += self.batchsize X = self.train_data[start:end, :] Y = self.train_label[start:end, :] return X, Y def get_val(self): X = np.expand_dims(self.val_data, axis=-1) return X, self.val_label[:] def get_whole(self): # get whole, for validation set X = self.train[:, :] # np.expand_dims(self.train[:, :], axis=-1) Y = self.label[:, :] return X, Y def reset(self): self.pointer = 0 self.epoch = 0
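# Hypothetical end-to-end sketch for PowerForecaster (added here; the file name and column layout
# are assumptions, not part of the original code). The CSV is expected to carry the Date/Time,
# temperature and kWh columns referenced by ColumnNames above.
df = pd.read_csv('household_power.csv')          # placeholder path
forecaster = PowerForecaster(df, model=Models.LSTM, upsample_freq='H')
forecaster.sliding_window()                      # build (window, features) training tensors
forecaster.fit()                                 # dispatches to lstm_fit() for Models.LSTM
forecaster.evaluate_performance()                # RMSE on the held-out windows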
class NNClassifier: ''' Usage: clf = NNClassifier(**params) history = clf.fit( X_train, y_train, X_valid, y_valid, early_stopping_rounds ) ''' def __init__(self, input_shape=1024, input_dropout=0.2, hidden_layers=1, hidden_units=64, hidden_dropout=0.2, batch_norm="none", learning_rate=0.05, batch_size=64, epochs=10000): self.input_shape = int(input_shape) # layer param self.input_dropout = input_dropout # layer param self.hidden_layers = int(hidden_layers) # layer param self.hidden_units = int(hidden_units) # layer param self.hidden_dropout = hidden_dropout # layer param self.batch_norm = batch_norm # layer param self.learning_rate = learning_rate # optimizer param self.batch_size = int(batch_size) # fit param self.epochs = int(epochs) # fit param def fit(self, X_train, y_train, X_valid, y_valid, early_stopping_rounds): # Data standardization self.transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal') X_train = self.transformer.fit_transform(X_train) X_valid = self.transformer.transform(X_valid) # layers self.model = Sequential() self.model.add(Dropout(self.input_dropout, input_shape=(self.input_shape,))) for i in range(self.hidden_layers): self.model.add(Dense(self.hidden_units)) if self.batch_norm == 'before_act': self. model.add(BatchNormalization()) self.model.add(ReLU()) self.model.add(Dropout(self.hidden_dropout)) self.model.add(Dense(1, activation='sigmoid')) # Optimazer optimizer = Adam(lr=self.learning_rate, beta_1=0.9, beta_2=0.999, decay=0.) # Compile self.model.compile( loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'] ) # train early_stopping = EarlyStopping(patience=early_stopping_rounds, restore_best_weights=True) self.history = self.model.fit( X_train, y_train, epochs=self.epochs, batch_size=self.batch_size, verbose=1, validation_data=(X_valid, y_valid), callbacks=[early_stopping] ) return self.history def predict(self, x): x = self.transformer.transform(x) y_pred = self.model.predict(x).astype("float64") y_pred = y_pred.flatten() return y_pred def get_model(self): return self.model
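# Minimal usage sketch for NNClassifier (added; synthetic data, hyperparameters are examples only).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=1024, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

clf = NNClassifier(input_shape=1024, hidden_layers=2, hidden_units=128,
                   learning_rate=0.001, batch_size=128, epochs=100)
history = clf.fit(X_tr, y_tr, X_va, y_va, early_stopping_rounds=10)
proba = clf.predict(X_va)   # flattened sigmoid outputs in [0, 1]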
class CutOff(TransformerMixin): def fit(self, X, y=None, **fit_params): return self def transform(self, X, y=None, **fit_params): X[X > 3] = 3 X[X < -3] = -3 return X # Preprocessing for numerical data num_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='constant')), ('scale', RobustScaler(quantile_range=[5, 95])), ('quantile', QuantileTransformer( n_quantiles=300, output_distribution='normal', random_state=0)), ('cutoff', CutOff()), # Cut off at 3 standard deviations ('norm', Normalizer(norm='l2')) ]) # Preprocessing for nominal categorical data cat_transformer_nominal = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='constant')), ('pca', PCA(whiten=True, random_state=0)), ('bins', KBinsDiscretizer(n_bins=100, encode='onehot', strategy='quantile')), ('norm', Normalizer(norm='l2')), ]) # Preprocessing for ordinal categorical data cat_transformer_ordinal = Pipeline(steps=[
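# Sketch (added): wiring the numerical and nominal pipelines above into a single ColumnTransformer.
# The ordinal pipeline is truncated in the source, so it is left out here; column names are
# placeholders for whatever the real dataset uses.
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, ['age', 'income']),            # placeholder numeric columns
    ('cat', cat_transformer_nominal, ['city', 'channel']),  # placeholder nominal columns
])
# X_processed = preprocessor.fit_transform(X)  # with X holding the columns listed above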
def rgb_burn_in(red, green, blue, burn_in_array, color=None, min_value=None, max_value=None, colormap='viridis', fade=1, uniform_distribution=False, no_data_value=-9999, valid_value=1, transp=0.0): """ Burn in a mask or a specific parameter into an RGB image for visualization purposes. The burn_in_array will be copied where values are different from no_data_value. :param uniform_distribution: convert the input values in a uniform histogram :param colormap: matplotlib colormap (string) to create the RGB ramp :param max_value: maximum value :param min_value: minimum value :param red: Original red band :param green: Original green band :param blue: Original blue band :param burn_in_array: Values to be burnt in :param no_data_value: Value to ne unconsidered :param color: Tuple of color (R, G, B) to be used in the burn in :param fade: Fade the RGB bands to emphasize the copied values :param transp: Transparency to use in the mask (0=opaque 1=completely transparent) :return: RGB image bands """ if color: new_red = np.where(burn_in_array == valid_value, color[0] * (1 - transp) + red * (transp), red * fade) new_green = np.where(burn_in_array == valid_value, color[1] * (1 - transp) + green * (transp), green * fade) new_blue = np.where(burn_in_array == valid_value, color[2] * (1 - transp) + blue * (transp), blue * fade) else: # the mask is where the value equals no_data_value mask = (burn_in_array == no_data_value) # the valid values are those outside the mask (~mask) burn_in_values = burn_in_array[~mask] # apply scalers to uniform the data if uniform_distribution: burn_in_values = QuantileTransformer().fit_transform(burn_in_values[:, np.newaxis])[:, 0] # burn_in_values = MinMaxScaler((0, 0.3)).fit_transform(burn_in_values) # rgb_burn_in_values = DWutils.gray2color_ramp(burn_in_values[:, 0], limits=(0, 0.3)) rgb_burn_in_values = DWutils.gray2color_ramp(burn_in_values, min_value=min_value, max_value=max_value, colormap=colormap, limits=(0, 0.25)) # return the scaled values to the burn_in_array # burn_in_array[~mask] = burn_in_values[:, 0] # calculate a color_ramp for these pixels # rgb_burn_in_values = DWutils.gray2color_ramp(burn_in_array, limits=(0, 0.3)) # new_red = np.where(burn_in_array == no_data_value, red, rgb_burn_in_values[:, 0]) # new_green = np.where(burn_in_array == no_data_value, green, rgb_burn_in_values[:, 1]) # new_blue = np.where(burn_in_array == no_data_value, blue, rgb_burn_in_values[:, 2]) # return the scaled values to the burn_in_array burn_in_array[~mask] = rgb_burn_in_values[:, 0] burn_in_red = np.copy(burn_in_array) burn_in_array[~mask] = rgb_burn_in_values[:, 1] burn_in_green = np.copy(burn_in_array) burn_in_array[~mask] = rgb_burn_in_values[:, 2] burn_in_blue = np.copy(burn_in_array) # burn in the values new_red = np.where(burn_in_array == no_data_value, red*fade, burn_in_red) new_green = np.where(burn_in_array == no_data_value, green*fade, burn_in_green) new_blue = np.where(burn_in_array == no_data_value, blue*fade, burn_in_blue) return new_red, new_green, new_blue
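# Illustrative call to rgb_burn_in (added example): burn a binary mask into a random RGB scene in
# solid blue while fading the original bands slightly. Array shapes and values are synthetic.
import numpy as np

rng = np.random.RandomState(0)
red, green, blue = (rng.rand(256, 256) for _ in range(3))
water_mask = (rng.rand(256, 256) > 0.9).astype(int)      # 1 where the mask should be painted

new_r, new_g, new_b = rgb_burn_in(red, green, blue, burn_in_array=water_mask,
                                  color=(0.1, 0.2, 0.9), fade=0.8, transp=0.2)
rgb = np.dstack([new_r, new_g, new_b])                    # ready for plt.imshow(rgb)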
from sklearn.preprocessing import PowerTransformer from sklearn.preprocessing import QuantileTransformer from sklearn.model_selection import train_test_split print(__doc__) N_SAMPLES = 1000 FONT_SIZE = 6 BINS = 30 rng = np.random.RandomState(304) bc = PowerTransformer(method='box-cox') yj = PowerTransformer(method='yeo-johnson') qt = QuantileTransformer(output_distribution='normal', random_state=rng) size = (N_SAMPLES, 1) # lognormal distribution X_lognormal = rng.lognormal(size=size) # chi-squared distribution df = 3 X_chisq = rng.chisquare(df=df, size=size) # weibull distribution a = 50 X_weibull = rng.weibull(a=a, size=size) # gaussian distribution
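# Short continuation sketch (added; the original example goes on to build a full comparison figure):
# apply the three transformers to one of the skewed samples and compare histograms of the results.
import matplotlib.pyplot as plt

X_train, X_test = train_test_split(X_lognormal, test_size=0.5, random_state=rng)
fig, axes = plt.subplots(1, 4, figsize=(12, 3))
axes[0].hist(X_test, bins=BINS)
axes[0].set_title('original (lognormal)')
for ax, (name, tfm) in zip(axes[1:], [('Box-Cox', bc), ('Yeo-Johnson', yj), ('Quantile', qt)]):
    ax.hist(tfm.fit(X_train).transform(X_test), bins=BINS)
    ax.set_title(name)
plt.tight_layout()
plt.show()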
# In[74]: train_features = pd.read_csv('../input/lish-moa/train_features.csv') train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv') train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv') test_features = pd.read_csv('../input/lish-moa/test_features.csv') submission = pd.read_csv('../input/lish-moa/sample_submission.csv') GENES = [col for col in train_features.columns if col.startswith('g-')] CELLS = [col for col in train_features.columns if col.startswith('c-')] for col in (GENES + CELLS): transformer = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal") vec_len = len(train_features[col].values) vec_len_test = len(test_features[col].values) raw_vec = train_features[col].values.reshape(vec_len, 1) transformer.fit(raw_vec) train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0] test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0] def seed_everything(seed=42): random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True
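# Aside (added): the per-column loop above fits one QuantileTransformer per feature. Because the
# transformer already operates column-wise, an equivalent and faster variant fits a single
# transformer on all g-/c- columns at once, still using only the training rows for the fit.
qt_all = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
train_features[GENES + CELLS] = qt_all.fit_transform(train_features[GENES + CELLS])
test_features[GENES + CELLS] = qt_all.transform(test_features[GENES + CELLS])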