Code example #1
# Imports required by this snippet
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, QuantileTransformer


def chang_hug_map(X, hex_colors, FONT_SIZE=12, BINS=30):
    '''
    Applies the Chang & Hug map for preprocessing data to a normal distribution:
    REF: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_map_data_to_normal.html#sphx-glr-auto-examples-preprocessing-plot-map-data-to-normal-py

    Parameters:
    * X = features (DataFrame)
    * hex_colors = hexadecimal colors to be used for each feature
    * FONT_SIZE = size of the font on the plots
    * BINS = number of bins in the histogram plots
    '''
    # setting preprocessing methods: PowerTransformer (Box-Cox, Yeo-Johnson); QuantileTransformer
    scaler = MinMaxScaler(feature_range=(1, 2))
    boxcox = PowerTransformer(method='box-cox')
    bc = Pipeline(steps=[('s', scaler), ('bc', boxcox)])

    yj = PowerTransformer(method='yeo-johnson')

    rng = np.random.RandomState(304)
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=rng)

    # adding distributions of columns
    distributions = []
    for i in range(0, len(X.columns)):
        name = X.columns[i]
        array = X[X.columns[i]].to_numpy().reshape(-1, 1)
        distributions.append((name, array))

    colors = hex_colors

    # generating the plot: a 12 x 15 grid of axes = 3 row-blocks of 4 rows
    # (original, Box-Cox, Yeo-Johnson, Quantile transform), 15 features per block
    fig, axes = plt.subplots(
        nrows=12, ncols=15,
        figsize=(35, 25))
    axes = axes.flatten()
    axes_idxs = [
        (0, 15, 30, 45),
        (1, 16, 31, 46),
        (2, 17, 32, 47),
        (3, 18, 33, 48),
        (4, 19, 34, 49),
        (5, 20, 35, 50),  # first set
        (6, 21, 36, 51),
        (7, 22, 37, 52),
        (8, 23, 38, 53),
        (9, 24, 39, 54),
        (10, 25, 40, 55),
        (11, 26, 41, 56),
        (12, 27, 42, 57),
        (13, 28, 43, 58),
        (14, 29, 44, 59),
        (60, 75, 90, 105),
        (61, 76, 91, 106),
        (62, 77, 92, 107),
        (63, 78, 93, 108),
        (64, 79, 94, 109),
        (65, 80, 95, 110),  # second set
        (66, 81, 96, 111),
        (67, 82, 97, 112),
        (68, 83, 98, 113),
        (69, 84, 99, 114),
        (70, 85, 100, 115),
        (71, 86, 101, 116),
        (72, 87, 102, 117),
        (73, 88, 103, 118),
        (74, 89, 104, 119),
        (120, 135, 150, 165),
        (121, 136, 151, 166),
        (122, 137, 152, 167),
        (123, 138, 153, 168),
        (124, 139, 154, 169),
        (125, 140, 155, 170),
        (126, 141, 156, 171),
        (127, 142, 157, 172),
        (128, 143, 158, 173),
        (129, 144, 159, 174),
        (130, 145, 160, 175),
        (131, 146, 161, 176),
        (132, 147, 162, 177),
        (133, 148, 163, 178),
        (134, 149, 164, 179)
    ]

    axes_list = [(axes[i], axes[j], axes[k], axes[l])
                 for (i, j, k, l) in axes_idxs]

    for distribution, color, axes in zip(distributions, colors, axes_list):
        name, X_col = distribution
        X_train, X_test = train_test_split(X_col,
                                           test_size=0.2,
                                           random_state=rng)

        # perform power and quantile transforms
        X_trans_bc = bc.fit(X_train).transform(X_test)
        lmbda_bc = round(bc.named_steps['bc'].lambdas_[0], 2)
        X_trans_yj = yj.fit(X_train).transform(X_test)
        lmbda_yj = round(yj.lambdas_[0], 2)
        X_trans_qt = qt.fit(X_train).transform(X_test)

        ax_original, ax_bc, ax_yj, ax_qt = axes

        ax_original.hist(X_train, color=color, bins=BINS)
        ax_original.set_title(name, fontsize=FONT_SIZE)
        ax_original.tick_params(axis='both',
                                which='major',
                                labelsize=FONT_SIZE)

        for ax, X_trans, meth_name, lmbda in zip(
            (ax_bc, ax_yj, ax_qt), (X_trans_bc, X_trans_yj, X_trans_qt),
            ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
            (lmbda_bc, lmbda_yj, None)):
            ax.hist(X_trans, color=color, bins=BINS)
            title = f'After {meth_name}'
            if lmbda is not None:
                title += f'\n$\\lambda$ = {lmbda}'
            ax.set_title(title, fontsize=FONT_SIZE)
            ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
            ax.set_xlim([-3.5, 3.5])

    # Hide the unused trailing subplots
    for i in range(-10, 0):
        ax_original, ax_bc, ax_yj, ax_qt = axes_list[i]
        ax_original.axis('off')
        ax_bc.axis('off')
        ax_yj.axis('off')
        ax_qt.axis('off')

    # Export and last adjustments
    plt.tight_layout()
    plt.savefig('fig/09_col_trf.png')
    plt.show()
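
A minimal usage sketch for the function above, assuming a numeric DataFrame and one hex color per column (the column names, colors, and sample data below are placeholders, not from the original):

# Usage sketch (assumption: chang_hug_map and its imports are in scope).
import os
import numpy as np
import pandas as pd

rng_demo = np.random.RandomState(0)
X_demo = pd.DataFrame({
    'feat_a': rng_demo.lognormal(size=1000),
    'feat_b': rng_demo.chisquare(df=3, size=1000),
    'feat_c': rng_demo.exponential(size=1000),
})
demo_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # one color per feature

os.makedirs('fig', exist_ok=True)  # the function saves to fig/09_col_trf.png
chang_hug_map(X_demo, demo_colors)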
Code example #2
# Imports assumed for this excerpt (x, y and x_TESTE are defined elsewhere)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LinearRegression

x_treino, x_teste, y_treino, y_teste = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,  # fixed seed for the random split
)

# Scale the features

# Best results obtained with:
escala = QuantileTransformer()

# 2: escala = RobustScaler()
# 3: escala = StandardScaler()
escala.fit(x_treino)

x_treino = escala.transform(x_treino)
x_teste = escala.transform(x_teste)

# x_TESTE: separate hold-out set (presumably loaded elsewhere in the original script)
x_TESTE = escala.transform(x_TESTE)

#--------------------------------------------------------------------
# Train a LINEAR regressor
#--------------------------------------------------------------------

regressor_linear = LinearRegression()
regressor_linear = regressor_linear.fit(x_treino, y_treino)

y_resposta_treino = regressor_linear.predict(x_treino)
y_resposta_teste = regressor_linear.predict(x_teste)
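
As a follow-up (not part of the original excerpt), the fitted linear regressor can be scored on both splits; a minimal sketch using scikit-learn metrics:

# Evaluation sketch (assumption: continues from the variables above).
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

rmse_treino = np.sqrt(mean_squared_error(y_treino, y_resposta_treino))
rmse_teste = np.sqrt(mean_squared_error(y_teste, y_resposta_teste))
print('RMSE train: %.4f  test: %.4f' % (rmse_treino, rmse_teste))
print('R2   train: %.4f  test: %.4f' % (
    r2_score(y_treino, y_resposta_treino),
    r2_score(y_teste, y_resposta_teste)))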
Code example #3
try:
    y = pd.read_pickle("./results/features/FeatureLabels.pkl").values.ravel()
except FileNotFoundError:
    print("The file ./results/features/FeatureLabels.pkl was not found. Have you run RFE.py in the current directory?")
    exit()

XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2)

############
# Training:
yTrain = yTrain.ravel()
yTest = yTest.ravel()

# Rescale:
qt = QuantileTransformer()
qt.fit(XTrain)
XTrain = qt.transform(XTrain)
XTest = qt.transform(XTest)

print("XTrain max: ", np.max(XTrain))

"""
# LDA:
lda = LDA()
lda.fit(XTrain, yTrain)
XTrain = lda.transform(XTrain)
XTest = lda.transform(XTest)
"""

print("Starting training")
print("XTrain shape: ", XTrain.shape)
Code example #4
# RobustScaler centers and scales with the median and IQR, and is therefore
# not influenced by a small number of very large marginal outliers.
scaler3 = RobustScaler()
scaler3.fit(X)
X3 = scaler3.transform(X)
df3 = pd.DataFrame(data=X3, columns=column_names)
print(df3.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df3, xlim=[-2,3], ylim = [-2,3]) #Range -2 to 3


#4 PowerTransformer
# applies a power transformation to each feature to make the data more Gaussian-like
scaler4 = PowerTransformer()
scaler4.fit(X)
X4 = scaler4.transform(X)
df4 = pd.DataFrame(data=X4, columns=column_names)
print(df4.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df4) #

#5 QuantileTransformer
# has an additional output_distribution parameter allowing to match a 
# Gaussian distribution instead of a uniform distribution.
scaler5 = QuantileTransformer()
scaler5.fit(X)
X5 = scaler5.transform(X)
df5 = pd.DataFrame(data=X5, columns=column_names)
print(df5.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df5) #
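
For reference, the MedInc/AveOccup columns suggest this excerpt runs on the California housing data; a hedged setup for X and column_names might look like this (the actual loading code is not part of the excerpt):

# Hedged setup sketch -- the original data-loading code is not shown above.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
X = housing.data
column_names = housing.feature_names  # includes 'MedInc' and 'AveOccup'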



Code example #5
File: utilities.py  Project: smkia/DNM
def prepare_data(control_fmri_data, control_phenotype_data, SCHZ_fmri_data, SCHZ_phenotype_data, \
                 ADHD_fmri_data, ADHD_phenotype_data, BIPL_fmri_data, BIPL_phenotype_data, train_num, factor=5, sampling='bootstrap'):
    CTRL_num = control_phenotype_data.shape[0]
    SCHZ_num = SCHZ_phenotype_data.shape[0]
    ADHD_num = ADHD_phenotype_data.shape[0]
    BIPL_num = BIPL_phenotype_data.shape[0]
    x_context = torch.zeros([train_num+15, factor, control_phenotype_data.shape[1]])
    y_context = torch.zeros([train_num+15, factor, control_fmri_data.shape[1], control_fmri_data.shape[2], control_fmri_data.shape[3]])
    x_all = torch.zeros([train_num+15, factor, control_phenotype_data.shape[1]])
    y_all = torch.zeros([train_num+15, factor, control_fmri_data.shape[1], control_fmri_data.shape[2], control_fmri_data.shape[3]])
    
    rand_idx = np.random.permutation(CTRL_num)
    train_idx_ctrl = rand_idx[0:train_num]
    test_idx_ctrl = np.setdiff1d(np.array(range(CTRL_num)),train_idx_ctrl)
    rand_idx = np.random.permutation(SCHZ_num)
    train_idx_SCHZ = rand_idx[0:5]
    test_idx_SCHZ = np.setdiff1d(np.array(range(SCHZ_num)),train_idx_SCHZ)
    rand_idx = np.random.permutation(ADHD_num)
    train_idx_ADHD = rand_idx[0:5]
    test_idx_ADHD = np.setdiff1d(np.array(range(ADHD_num)),train_idx_ADHD)
    rand_idx = np.random.permutation(BIPL_num)
    train_idx_BIPL = rand_idx[0:5]
    test_idx_BIPL = np.setdiff1d(np.array(range(BIPL_num)),train_idx_BIPL)

    x_context_train = torch.cat((control_phenotype_data[train_idx_ctrl,:],
                                      SCHZ_phenotype_data[train_idx_SCHZ,:], ADHD_phenotype_data[train_idx_ADHD,:], BIPL_phenotype_data[train_idx_BIPL,:]))
    means = x_context_train.mean(dim = 0, keepdim = True)
    stds = x_context_train.std(dim = 0, keepdim = True)
    x_context_train = (x_context_train - means) / stds
    # replace NaNs (NaN != NaN) and +/- infinities with 0
    x_context_train[x_context_train != x_context_train] = 0
    x_context_train[x_context_train == float("-Inf")] = 0
    x_context_train[x_context_train == float("Inf")] = 0
    
    x_context_test = torch.cat((control_phenotype_data[test_idx_ctrl,:], 
                                SCHZ_phenotype_data[test_idx_SCHZ,:], ADHD_phenotype_data[test_idx_ADHD,:], BIPL_phenotype_data[test_idx_BIPL,:]),0)
    x_context_test = (x_context_test - means) / stds
    # same NaN / infinity clean-up for the test context
    x_context_test[x_context_test != x_context_test] = 0
    x_context_test[x_context_test == float("-Inf")] = 0
    x_context_test[x_context_test == float("Inf")] = 0
    
    x_test = x_context_test
    x_context_test = x_context_test.unsqueeze(1).expand(-1,factor,-1)
    
    y_context_train = torch.cat((control_fmri_data[train_idx_ctrl,:,:,:],
                                 SCHZ_fmri_data[train_idx_SCHZ,:,:,:], ADHD_fmri_data[train_idx_ADHD,:,:,:], BIPL_fmri_data[train_idx_BIPL,:,:,:]),0)
    y_test = torch.cat((control_fmri_data[test_idx_ctrl,:,:,:], SCHZ_fmri_data[test_idx_SCHZ,:,:,:], 
                        ADHD_fmri_data[test_idx_ADHD,:,:,:], BIPL_fmri_data[test_idx_BIPL,:,:,:]),0)
    y_context_test = torch.zeros([y_test.shape[0], factor, y_test.shape[1], y_test.shape[2], y_test.shape[3]])
    
    scaler = QuantileTransformer()
    scaler.fit(ravel_2D(np.concatenate((control_fmri_data, SCHZ_fmri_data, ADHD_fmri_data, BIPL_fmri_data),0)))
    
    for i in range(factor):
        if sampling == 'noise':
            x_context[:,i,:] = x_context_train + torch.randn(x_context_train.shape) * 0.01
            x_context_test[:,i,:] = x_context_test[:,i,:] + torch.randn([x_context_test.shape[0],x_context_test.shape[2]]) * 0.01
        elif sampling == 'bootstrap':
            x_context[:,i,:] = x_context_train[:,:]
        idx = np.random.randint(0,x_context_train.shape[0], x_context_train.shape[0])
        for j in range(y_context_train.shape[1]):
            for k in range(y_context_train.shape[2]):
                for l in range(y_context_train.shape[3]):
                    reg = LinearRegression()
                    if sampling == 'noise':
                        reg.fit(x_context[:,i,:].numpy(),y_context_train[:,j,k,l].numpy())
                    elif sampling == 'bootstrap':
                        reg.fit(x_context[idx,i,:].numpy(),y_context_train[idx,j,k,l].numpy())
                        
                    y_context[:,i,j,k,l] = torch.tensor(reg.predict(x_context[:,i,:].numpy()))    
                    y_context_test[:,i,j,k,l] = torch.tensor(reg.predict(x_context_test[:,i,:].numpy()))
        y_context[:,i,:,:,:] = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context[:,i,:,:,:])),y_context[:,i,:,:,:].shape))
        y_context_test[:,i,:,:,:] = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context_test[:,i,:,:,:])),y_context_test[:,i,:,:,:].shape))
        print(i)
    x_all = x_context_train.unsqueeze(1).expand(-1,factor,-1)
    y_all = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context_train)),y_context_train.shape),dtype=torch.float32).unsqueeze(1).expand(-1,factor,-1,-1,-1)
    y_test = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_test)),y_test.shape),dtype=torch.float32)
    y_test = y_test.view((y_test.shape[0],1,y_test.shape[1],y_test.shape[2],y_test.shape[3]))
   
    labels = np.zeros(y_test.shape[0])
    labels[len(test_idx_ctrl):] = 1
    diagnosis_labels = np.zeros(y_test.shape[0])
    diagnosis_labels[len(test_idx_ctrl):len(test_idx_ctrl)+len(test_idx_SCHZ)] = 1
    diagnosis_labels[len(test_idx_ctrl)+len(test_idx_SCHZ):len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD)] = 2
    diagnosis_labels[len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD):len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD)+len(test_idx_BIPL)] = 3
    return x_context, y_context, x_all, y_all, x_context_test, y_context_test, x_test, y_test, labels, diagnosis_labels, scaler
Code example #6
N_SAMPLES = 580 * 1000
DF_LOAD_PATH = "../data/mod_29_rsf"
ENCODER_FILE_PATH = '../models/cnn_encoder_05-14--23-51.h5'
ENCODED_LENGTH = 256

df_samples = pd.read_pickle(DF_LOAD_PATH)

data_as_array = df_samples.values

QTscaler = QuantileTransformer()
MMscaler = MinMaxScaler()

MMscaler.fit(data_as_array[:, :256])
data_as_array[:, :256] = MMscaler.transform(data_as_array[:, :256])

QTscaler.fit(data_as_array[:, :256])
data_as_array[:, :256] = QTscaler.transform(data_as_array[:, :256])

# load encoder
encoder = load_model(ENCODER_FILE_PATH)

#encoder.compile(optimizer='adam', loss='mean_squared_error')
encoded_samples = encoder.predict(data_as_array[:,
                                                0:256].astype(float).reshape(
                                                    N_SAMPLES, 128, 2, 1))

encoded_samples = encoded_samples.reshape(-1, ENCODED_LENGTH)

encoded_column_labels = ['pixel' + str(i) for i in range(ENCODED_LENGTH)]
label_columns_labels = df_samples.columns.values[-2:]
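
The excerpt stops after preparing the column labels; a plausible next step (an assumption, not shown in the original) is to assemble the encoded samples and the two label columns into a new DataFrame:

# Assumed continuation: combine encoded features with the original label columns.
df_encoded = pd.DataFrame(encoded_samples, columns=encoded_column_labels)
df_encoded[label_columns_labels[0]] = df_samples[label_columns_labels[0]].values
df_encoded[label_columns_labels[1]] = df_samples[label_columns_labels[1]].values
print(df_encoded.shape)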
Code example #7
class DfScaler(BaseEstimator, TransformerMixin):
    '''
    Wrapper of several sklearn scalers that keeps the dataframe structure
    '''
    def __init__(self,
                 method='standard',
                 feature_range=(0, 1),
                 n_quantiles=1000,
                 output_distribution='normal',
                 random_state=345):
        super().__init__()
        self.method = method
        self._validate_input()
        self.scale_ = None
        if self.method == 'standard':
            self.scl = StandardScaler()
            self.mean_ = None
        elif self.method == 'robust':
            self.scl = RobustScaler()
            self.center_ = None
        elif self.method == 'minmax':
            self.feature_range = feature_range
            self.scl = MinMaxScaler(feature_range=self.feature_range)
            self.min_ = None
            self.data_min_ = None
            self.data_max_ = None
            self.data_range_ = None
            self.n_samples_seen_ = None
        elif self.method == 'quantile':
            self.n_quantiles = n_quantiles
            self.output_distribution = output_distribution
            self.random_state = random_state
            self.scl = QuantileTransformer(
                n_quantiles=self.n_quantiles,
                output_distribution=self.output_distribution,
                random_state=self.random_state)
            self.n_quantiles_ = None
            self.quantiles_ = None
            self.references_ = None

    def _validate_input(self):
        allowed_methods = ["standard", 'robust', 'minmax', 'quantile']
        if self.method not in allowed_methods:
            raise ValueError(
                f"Can only use these methods: {allowed_methods} got method={self.method}"
            )

    def fit(self, X, y=None):
        self.scl.fit(X)
        if self.method == 'quantile':
            return self
        if self.method == 'standard':
            self.mean_ = pd.Series(self.scl.mean_, index=X.columns)
        elif self.method == 'robust':
            self.center_ = pd.Series(self.scl.center_, index=X.columns)
        elif self.method == 'minmax':
            self.min_ = pd.Series(self.scl.min_, index=X.columns)
            self.data_min_ = pd.Series(self.scl.data_min_, index=X.columns)
            self.data_max_ = pd.Series(self.scl.data_max_, index=X.columns)
            self.data_range_ = self.data_max_ - self.data_min_
            self.n_samples_seen_ = X.shape[0]
        self.scale_ = pd.Series(self.scl.scale_, index=X.columns)
        return self

    def transform(self, X, y=None):
        # assumes X is a DataFrame
        Xscl = self.scl.transform(X)
        Xscaled = pd.DataFrame(Xscl, index=X.index, columns=X.columns)
        return Xscaled
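
A short usage sketch for DfScaler on a toy DataFrame (assuming the scikit-learn scalers and the BaseEstimator/TransformerMixin imports used by the class are in scope; the data below is illustrative):

# Usage sketch for DfScaler (names below are illustrative).
import pandas as pd

df_toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [10.0, 20.0, 30.0, 40.0]})
df_scaler = DfScaler(method='minmax', feature_range=(0, 1))
df_scaled = df_scaler.fit(df_toy).transform(df_toy)
print(df_scaled)            # still a DataFrame with the original index and columns
print(df_scaler.data_min_)  # per-column minima kept as a pd.Series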
Code example #8
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]
for col in (GENES + CELLS):

    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    vec_len = len(train_features[col].values)
    vec_len_test = len(test_features[col].values)
    raw_vec = train_features[col].values.reshape(vec_len, 1)
    transformer.fit(raw_vec)

    train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
    test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]
    
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)
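
Because QuantileTransformer already computes its quantiles per feature, the per-column loop above can optionally be replaced by a single fit/transform over all GENES + CELLS columns; a hedged sketch that should be equivalent:

# Vectorised alternative to the per-column loop (sketch; fit on train, apply to test).
transformer_all = QuantileTransformer(n_quantiles=100, random_state=0,
                                      output_distribution="normal")
train_features[GENES + CELLS] = transformer_all.fit_transform(
    train_features[GENES + CELLS])
test_features[GENES + CELLS] = transformer_all.transform(
    test_features[GENES + CELLS])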

Code example #9
            if training_file == True:

                print('Training file == True, so fitting NearestNeighbors...')

                current_trans = current.copy()
                for index in range(1, 7):
                    if index in [2, 5]:
                        trans = QuantileTransformer(
                            n_quantiles=500, output_distribution='normal')
                    else:
                        trans = PowerTransformer(method='box-cox')
                    current_trans[:, index] = (
                        (current_trans[:, index] + 1.) / 2.4) + 0.1
                    X = np.expand_dims(current_trans[:, index], 1)
                    trans.fit(X)
                    dump(
                        trans,
                        open('%strans_%d.pkl' % (transformer_directory, index),
                             'wb'))

                trans_1 = load(
                    open('%strans_1.pkl' % transformer_directory, 'rb'))
                trans_2 = load(
                    open('%strans_2.pkl' % transformer_directory, 'rb'))
                trans_3 = load(
                    open('%strans_3.pkl' % transformer_directory, 'rb'))
                trans_4 = load(
                    open('%strans_4.pkl' % transformer_directory, 'rb'))
                trans_5 = load(
                    open('%strans_5.pkl' % transformer_directory, 'rb'))
Code example #10
class QuantileExtremeValuesTransformer(BaseExtremeValueTransformer):
    """Applies a quantile transformation to columns which have "extreme" values.

    The quantile transformation is ``sklearn.preprocessing.quantile_transform`` that converts columns with extreme
    values to a uniform distribution. Quantiles are computed during the ``fit`` stage and stored as state, which are
    then used in ``transform``.

    A value is considered "extreme" if it is greater than ``quantile`` or less than 100 - ``quantile`` percent of the
    data, and is more than ``threshold_std`` many standard deviations away from the mean. Heavy-tailed distributions are
    therefore more likely to have "extreme" values.

    Number of output columns is the same as number of input columns: each column is either transformed or not.

    Parameters
    ----------
    quantile : int (default = 98)
        Used to calculate the lower and upper cutoff quantiles for a value to be considered "extreme".
        This must be an integer between 0 and 100.

    threshold_std : float (default = 4.0)
        Number of standard deviations away from the mean (in standard units). For a given column, if the magnitude of
        the quantile cutoffs is greater than the threshold_std cutoff, then that column contains an extreme value.
        ``threshold_std`` is converted to nonstandard units:
        ``nonstandard_thresholds = standard_threshold * np.std(X, axis=0) + np.mean(X, axis=0)``.


    Attributes
    ----------
    n_input_features_ : int
        The number of columns in the input dataset.

    quantiles_ : 2D array (2, n_input_features_)
        For each column j, ``quantiles_[0, j]`` is the value of the ``(100 - quantile)`` percentile and
        ``quantiles_[1, j]`` is the value of the ``quantile`` percentile.

    cols_to_transform_ : list of int
        List of column indices to determine which columns to apply the transformation of ``transform_function``.

    quantile_transformer_ : ``sklearn.preprocessing.QuantileTransformer``
        Instance of ``sklearn.preprocessing.QuantileTransformer``.

    Notes
    -----
    Accepts only two-dimensional, dense input arrays.

    This class inherits from ``sagemaker_sklearn_extension.preprocessing.BaseExtremeValueTransformer``.
    """
    def __init__(self, quantile=98, threshold_std=4.0):
        super().__init__(quantile=quantile, threshold_std=threshold_std)

    def fit(self, X, y=None):
        """Compute the lower and upper quantile cutoffs, columns to transform, and each column's quantiles.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data array to transform. Must be numeric, non-sparse, and two-dimensional.

        Returns
        -------
        self : QuantileExtremeValueTransformer
        """
        super().fit(X)
        self.quantile_transformer_ = QuantileTransformer(random_state=0,
                                                         copy=True)
        self.quantile_transformer_.fit(X)
        return self

    def _transform_function(self, x, idx=None):
        """Applies single column quantile transform from ``sklearn.preprocessing.QuantileTransformer``.

        Uses ``quantile_transformer_.quantiles_`` calculated during ``fit`` if given an index, otherwise the quantiles
        will be calculated from input ``x``.
        """
        if idx is not None:  # explicit None check so that column index 0 also uses the stored quantiles
            return self.quantile_transformer_._transform_col(  # pylint: disable=protected-access
                x, self.quantile_transformer_.quantiles_[:, idx], False)
        return quantile_transform_nonrandom(x)
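
A small usage sketch, assuming the sagemaker-sklearn-extension package that provides BaseExtremeValueTransformer and quantile_transform_nonrandom is installed and that the base class supplies the fit/transform plumbing described in the docstring (the data here is illustrative):

# Usage sketch (illustrative data; assumes sagemaker_sklearn_extension is available).
import numpy as np

demo_rng = np.random.RandomState(0)
X_demo = demo_rng.normal(size=(500, 2))

evt = QuantileExtremeValuesTransformer(quantile=98, threshold_std=4.0)
X_out = evt.fit(X_demo).transform(X_demo)
print(evt.cols_to_transform_)  # indices of columns flagged as "extreme" (may be empty for this toy data)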
Code example #11
import numpy as np

from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2

#1 Data
dataset = load_boston()
x = dataset.data
y = dataset.target

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

scaler = QuantileTransformer()  # default: uniform output distribution
# scaler = QuantileTransformer(output_distribution='normal')  # normal distribution

scaler.fit(x)
x = scaler.transform(x)

# QuantileTransformer
print(np.max(x), np.min(x))  # 1.0 0.0
print(np.max(x[0]))  # 1.0

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=104,
                                                    shuffle=True)

#2 Model definition
from tensorflow.keras.models import Sequential, Model
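
The excerpt ends at the Keras import; a minimal continuation sketch for a small regression model follows (the layer sizes and training settings are assumptions, not from the original; note also that the scaler above is fitted on the full data before the split, which leaks test information):

# Minimal continuation sketch -- architecture and hyperparameters are assumptions.
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

model.compile(loss='mse', optimizer='adam')
model.fit(x_train, y_train, epochs=50, batch_size=32,
          validation_split=0.2, verbose=0)
print('test MSE:', model.evaluate(x_test, y_test, verbose=0))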
Code example #12
class GymFeature(object):
    """ Describes a feature intended to help predict attendance levels at a gym. """

    KEY_EXCHANGE = 'holiday_calendar'
    GENERIC_SYMBOL = 'SYM'

    def __init__(self,
                 name,
                 transformation,
                 normalization,
                 nbins,
                 length,
                 ndays,
                 resample_minutes,
                 start_market_minute,
                 is_target,
                 exchange_calendar,
                 local,
                 classify_per_series=False,
                 normalise_per_series=False):
        """
        Object containing all the information to manipulate the data relative to a financial feature.
        :param str name: Name of the feature
        :param dict transformation: contains name and parameters to use for processing, name must be in
            FINANCIAL_FEATURE_TRANSFORMATIONS
        :param str/None normalization: type of normalization. Can be None.
        :param int/None nbins: number of bins to be used for target classification. Can be None.
        :param int length: expected number of elements in the feature
        :param int ndays: number of trading days worth of data the feature should use.
        :param int resample_minutes: resampling frequency in number of minutes.
        :param int start_market_minute: number of minutes after market open the data collection should start from.
        :param bool is_target: if True the feature is a target.
        :param pandas_market_calendar exchange_calendar: exchange calendar.
        """
        # FIXME the get_default_flags args are temporary. We need to load a get_default_flags config in the unit tests.

        self.name = name
        self.transformation = Transformation(transformation)
        self.normalization = normalization
        self.nbins = nbins
        self.ndays = ndays
        self.resample_minutes = resample_minutes
        self.start_market_minute = start_market_minute
        self.is_target = is_target
        self.calendar = exchange_calendar
        self.minutes_in_trading_day = self.calendar.get_minutes_in_one_day()
        self.n_series = None
        self.local = local
        self.length = length

        self.bin_distribution = None

        self._assert_input(name, normalization, nbins, length, ndays,
                           resample_minutes, start_market_minute, is_target,
                           local)

        if self.nbins:
            self.bin_distribution_dict = {}
        else:
            self.bin_distribution_dict = None

        self.classify_per_series = classify_per_series
        self.normalise_per_series = normalise_per_series

        if self.normalization:
            self.scaler_dict = {}
            if self.normalization == 'robust':
                self.scaler = RobustScaler()
            elif self.normalization == 'min_max':
                self.scaler = MinMaxScaler()
            elif self.normalization == 'standard':
                self.scaler = StandardScaler()
            elif self.normalization == 'gaussian':
                self.scaler = QuantileTransformer(output_distribution='normal')
            else:
                raise ValueError(
                    'Requested normalisation not supported: {}'.format(
                        self.normalization))
        else:
            self.scaler = None
            self.scaler_dict = None

    @property
    def full_name(self):
        full_name = '{}_{}'.format(self.name, self.transformation.name)
        if self.resample_minutes > 0:
            resolution = '_' + str(self.resample_minutes) + 'T'
            full_name = full_name + resolution

        return full_name

    def _assert_input(self, name, normalization, nbins, length, ndays,
                      resample_minutes, start_market_minute, is_target, local):
        """ Make sure the inputs are sensible. """
        assert isinstance(name, str)
        assert normalization in FINANCIAL_FEATURE_NORMALIZATIONS
        assert (isinstance(nbins, int) and nbins > 0) or nbins is None
        assert isinstance(ndays, int) and ndays >= 0
        assert isinstance(resample_minutes, int) and resample_minutes >= 0
        assert isinstance(start_market_minute, int)
        assert start_market_minute < self.minutes_in_trading_day
        assert (isinstance(length, int) and length > 0)
        assert isinstance(is_target, bool)
        assert isinstance(local, bool)

    def process_prediction_data_x(self, prediction_data_x):
        """
        Apply feature-specific transformations to input prediction_data_x
        :param pd.Dataframe prediction_data_x: X data for model prediction task
        :return pd.Dataframe: processed_prediction_data_x
        """

        assert isinstance(prediction_data_x, pd.DataFrame)

        resampled_data = ResamplingStrategy.resample(
            self, deepcopy(prediction_data_x))

        return self.transformation.transform_x(self, resampled_data)

    def fit_normalisation(self, symbol_data, symbol=None):
        """ Creates a scikitlearn scalar, assigns it to a dictionary, fits it to the data

        :param symbol:
        :param symbol_data:
        :return:
        """

        symbol_data.flatten()
        symbol_data = symbol_data[np.isfinite(symbol_data)]
        symbol_data = symbol_data.reshape(-1, 1)  # Reshape for scikitlearn

        if len(symbol_data) > 0:
            if symbol:
                self.scaler_dict[symbol] = deepcopy(self.scaler)
                self.scaler_dict[symbol].fit(symbol_data)
            else:
                self.scaler.fit(symbol_data)

    def apply_normalisation(self, dataframe):
        """ Compute normalisation across the entire training set, or apply predetermined normalistion to prediction.

        :param dataframe: Features of shape [n_samples, n_series, n_features]
        :type dataframe: pd.DataFrame
        :return:
        """

        for symbol in dataframe:
            data_x = dataframe[symbol].values
            original_shape = data_x.shape
            data_x = data_x.reshape(-1, 1)

            nan_mask = np.ma.fix_invalid(data_x, fill_value=0)

            if self.normalise_per_series:
                if symbol in self.scaler_dict:
                    data_x = self.scaler_dict[symbol].transform(nan_mask.data)
                    # Put the nans back in so we know to avoid them
                    data_x[nan_mask.mask] = np.nan
                    dataframe[symbol] = data_x.reshape(original_shape)
                else:
                    logger.debug(
                        "Symbol lacks normalisation scaler: {}".format(symbol))
                    logger.debug(
                        "Dropping symbol from dataframe: {}".format(symbol))
                    dataframe.drop(symbol, axis=1, inplace=True)
            else:
                data_x = self.scaler.transform(nan_mask.data)
                # Put the nans back in so we know to avoid them
                data_x[nan_mask.mask] = np.nan
                dataframe[symbol] = data_x.reshape(original_shape)

        return dataframe

    def reshape_for_scikit(self, data_x):
        """ Scikit expects an input of the form [samples, features]; normalisation applied separately to each feature.

        :param data_x: Features of shape [n_samples, n_series, n_features]
        :return: nparray Same data as input, but now with two dimensions: [samples, f], each f has own normalisation
        """

        if self.normalise_per_series:
            n_series = data_x.shape[1]
            scikit_shape = (-1, n_series)
        else:
            scikit_shape = (-1, 1)

        return data_x.reshape(scikit_shape)

    def process_prediction_data_y(self, prediction_data_y,
                                  prediction_reference_data):
        """
        Apply feature-specific transformations to input prediction_data_y
        :param pd.Series prediction_data_y: y data for model prediction task
        :param pd.Series prediction_reference_data: reference data-point to calculate differential metrics
        :return pd.Series: processed_prediction_data_y
        """
        assert self.is_target
        assert isinstance(prediction_data_y, pd.Series)

        return self.transformation.transform_y(self, prediction_data_y,
                                               prediction_reference_data)

    def _get_safe_schedule_start_date(self, prediction_timestamp):
        """
        Calculate a safe schedule start date from input timestamp so that at least self.ndays trading days are available
        :param Timestamp prediction_timestamp: Timestamp when the prediction is made
        :return Timestamp: schedule_start_date
        """
        safe_ndays = max(MIN_MARKET_DAYS_SEARCH,
                         MARKET_DAYS_SEARCH_MULTIPLIER * self.ndays)
        return prediction_timestamp - timedelta(days=safe_ndays)

    def declassify_single_predict_y(self, predict_y):
        raise NotImplementedError(
            'Declassification is only available for multi-pass prediction at the moment.'
        )

    def _get_start_timestamp_x(self, prediction_timestamp):
        """
        Calculate the start timestamp of x-data for a given prediction timestamp.
        :param Timestamp prediction_timestamp: Timestamp when the prediction is made
        :return Timestamp: start timestamp of x-data
        """
        schedule_start_date = str(
            self._get_safe_schedule_start_date(prediction_timestamp))
        schedule_end_date = str(prediction_timestamp.date())
        market_open_list = self.calendar.schedule(
            schedule_start_date, schedule_end_date).market_open
        prediction_market_open = market_open_list[prediction_timestamp.date()]
        prediction_market_open_idx = np.argwhere(
            market_open_list == prediction_market_open).flatten()[0]
        start_timestamp_x = market_open_list[
            prediction_market_open_idx -
            self.ndays] + timedelta(minutes=self.start_market_minute)
        return start_timestamp_x

    def _index_selection_x(self, date_time_index, prediction_timestamp):
        """
        Create index selection rule for x data
        :param Timestamp prediction_timestamp: Timestamp when the prediction is made
        :return: index selection rule
        """
        start_timestamp_x = self._get_start_timestamp_x(prediction_timestamp)
        return (date_time_index >= start_timestamp_x) & (date_time_index <=
                                                         prediction_timestamp)

    def _select_prediction_data_x(self, data_frame, prediction_timestamp):
        """
        Select the x-data relevant for a input prediction timestamp.
        :param pd.Dataframe data_frame: raw x-data (unselected, unprocessed)
        :param Timestamp prediction_timestamp: Timestamp when the prediction is made
        :return pd.Dataframe: selected x-data (unprocessed)
        """

        try:
            n_rows = len(data_frame.index)
            end_point = data_frame.index.get_loc(prediction_timestamp,
                                                 method='pad')
            end_index = end_point + 1  # +1 because iloc is not inclusive of end index
            start_index = end_point - self.length + 1

            # Check if we're violating range of dataframe
            if end_index >= n_rows:
                offset = end_index - n_rows + 1
                start_index -= offset
                end_index -= offset
        except Exception:
            logger.debug(
                'Prediction timestamp {} not within range of dataframe'.format(
                    prediction_timestamp))
            start_index = 0
            end_index = -1

        return data_frame.iloc[start_index:end_index, :]

    def _select_prediction_data_y(self, data_frame, target_timestamp,
                                  n_forecasts):
        """
        Select the y-data for a prediction timestamp.
        :param pd.Dataframe data_frame: raw data (unselected, unprocessed)
        :param Timestamp prediction_timestamp: Timestamp when the prediction is made
        :return pd.Dataframe: selected y-data (unprocessed)
        """

        try:
            n_rows = len(data_frame.index)
            end_point = data_frame.index.get_loc(target_timestamp,
                                                 method='pad')
            start_index = end_point + 1  # start one row after the matched timestamp; n_forecasts sets the end index
            end_index = start_index + n_forecasts

            # Check if we're violating range of dataframe
            if end_index >= n_rows:
                offset = end_index - n_rows + 1
                start_index -= offset
                end_index -= offset
        except Exception:
            logger.debug(
                'Target timestamp {} not within range of dataframe'.format(
                    target_timestamp))
            start_index = 0
            end_index = -1

        return data_frame.iloc[start_index:end_index, :]

    def get_prediction_targets(self,
                               data_frame,
                               prediction_timestamp,
                               target_timestamp=None,
                               n_forecasts=1):
        """
        Compute targets from dataframe only if the current feature is target

        :param data_frame: Time indexed data
        :type data_frame: pd.DataFrame
        :param prediction_timestamp: the time of prediction
        :type prediction_timestamp: pd.Timestamp
        :param target_timestamp: the time predicted
        :type target_timestamp: pd.Timestamp
        :rtype pd.DataFrame
        """
        prediction_target = None

        if self.is_target and target_timestamp:
            prediction_target = self._select_prediction_data_y(
                data_frame, prediction_timestamp, n_forecasts)

        return prediction_target

    def get_prediction_features(self, data_frame, prediction_timestamp):
        """
        Compute features from dataframe

        :param data_frame: Time indexed data
        :type data_frame: pd.DataFrame
        :param prediction_timestamp: the time of prediction
        :type prediction_timestamp: pd.Timestamp
        :rtype: pd.DataFrame
        """
        prediction_features = self._select_prediction_data_x(
            data_frame, prediction_timestamp)

        if self.local:
            prediction_features = self.process_prediction_data_x(
                prediction_features)

        return prediction_features

    def fit_classification(self, symbol, symbol_data):
        """  Fill dict with classifiers

        :param symbol:
        :rtype symbol: str
        :param symbol_data:
        :return:
        """

        if self.nbins is None:
            return

        self.bin_distribution_dict[symbol] = BinDistribution(
            symbol_data, self.nbins)

    def apply_classification(self, dataframe):
        """ Apply predetermined classification to y data.

        :param pd panel data_x: Features of shape [n_samples, n_series, n_features]
        :return:
        """

        n_timesteps = len(dataframe.index)
        # note: pd.Panel was removed in pandas 0.25, so this code requires an older pandas
        hot_panel = pd.Panel(0,
                             items=dataframe.columns,
                             major_axis=np.arange(n_timesteps),
                             minor_axis=np.arange(self.nbins))

        for symbol in dataframe:
            data_y = dataframe[symbol].values

            key = symbol if self.classify_per_series else self.GENERIC_SYMBOL

            if key in self.bin_distribution_dict:
                symbol_distribution = self.bin_distribution_dict[key]
                one_hot_labels = symbol_distribution.classify_labels(data_y)
                if one_hot_labels.shape[-1] > 1:
                    hot_panel[symbol] = one_hot_labels
            else:
                logger.debug(
                    "Symbol lacks classification bins: {}".format(symbol))
                hot_panel.drop(symbol, axis=0, inplace=True)
                logger.debug("Dropping {} from dataframe.".format(symbol))

        return hot_panel.transpose(1, 0, 2)  # Puts time back as the index

    def inverse_transform_multi_predict_y(self,
                                          predict_y,
                                          symbols,
                                          confidence_interval=0.68):
        """
        Inverse-transform multi-pass predict_y data
        :param pd.Dataframe predict_y: target multi-pass prediction
        :return pd.Dataframe: inversely transformed mean and variance of target multi-pass prediction
        """
        assert self.is_target

        n_symbols = len(symbols)
        n_forecasts = predict_y.shape[2]

        data_shape = (n_forecasts, n_symbols)
        means = np.zeros(shape=data_shape, dtype=np.float32)
        lower_bound = np.zeros(shape=data_shape, dtype=np.float32)
        upper_bound = np.zeros(shape=data_shape, dtype=np.float32)
        assert predict_y.shape[1] == n_symbols, "Weird shape - predict y not equal to n symbols"

        for i, symbol in enumerate(symbols):
            key = symbol if self.classify_per_series else self.GENERIC_SYMBOL

            for j in range(n_forecasts):
                pdf = predict_y[:, i, j, :]
                if key in self.bin_distribution_dict and not np.any(
                        np.isnan(pdf)):
                    symbol_bins = self.bin_distribution_dict[key]
                    try:
                        means[j, i], lower_bound[j, i], upper_bound[j, i] = \
                            symbol_bins.estimate_confidence_interval(pdf, confidence_interval)
                    except Exception as e:
                        logging.debug(e)
                        raise e
                else:
                    logger.debug(
                        "Nans or no bin distribution found for symbol: {}".
                        format(symbol))
                    means[j, i] = np.nan
                    lower_bound[j, i] = np.nan
                    upper_bound[j, i] = np.nan

        return means, lower_bound, upper_bound

    def __repr__(self):
        return '<{} object: name: {}. full_name: {}>'.format(
            self.__class__.__name__, self.name, self.full_name)
Code example #13
plt.hist(dat_skew, bins = 25)
plt.show()

#%% convert the data into a dataframe
dat_skew = dat_skew.reshape((len(dat), 1))
dat_skew = pd.DataFrame(dat_skew, columns = ['Value'])

#%% generate and fit log transformer
lgt = vt.LogTransformer(variables= ['Value'])
lgt.fit(dat_skew)

#%% apply log transformation 
dat_lg = lgt.transform(dat_skew)

#%% plot the distribution of the transformed data
plt.hist(dat_lg['Value'], bins=25)
plt.show()

#%% generate and fit quantile transformer
qt = QuantileTransformer(output_distribution='normal')
qt.fit(dat_skew[['Value']])

#%% apply quantile transformation 
dat_q = qt.transform(dat_skew[['Value']])

#%% plot the distribution of the transformed data
plt.hist(dat_q, bins=25)
plt.show()

# %%
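
To quantify what the histograms show, the skewness before and after the quantile transform can be compared; a self-contained sketch (using synthetic right-skewed data, since the original `dat` is not part of this excerpt):

#%% skewness check (self-contained sketch with synthetic data)
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import QuantileTransformer

demo_rng = np.random.RandomState(0)
sample = demo_rng.exponential(scale=2.0, size=(5000, 1))

qt_demo = QuantileTransformer(output_distribution='normal')
sample_q = qt_demo.fit_transform(sample)

print('skewness before:', skew(sample.ravel()))    # strongly positive
print('skewness after :', skew(sample_q.ravel()))  # close to 0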
Code example #14
class KDEQuantileTransformer(TransformerMixin, BaseEstimator):
    """ Quantile transformer that, for each variable, uses the CDF obtained with kernel density estimation
    """
    def __init__(self,
                 n_quantiles=1000,
                 output_distribution='uniform',
                 smooth_peaks=True,
                 mirror_left=None,
                 mirror_right=None,
                 rho=0.5,
                 n_adaptive=1,
                 x_min=None,
                 x_max=None,
                 n_integral_bins=1000,
                 use_KDE=True,
                 use_inverse_qt=False,
                 random_state=0,
                 copy=True):
        """ Parameters with the class KDEQuantileTransformer

        KDEQuantileTransformer is a quantile transformer class that uses, for each variable, the CDF obtained with
        kernel density estimation. Besides the usual transformation functions, the class also provides the jacobian
        and inverse jacobian of the transformation and inverse transformation respectively.

        The KDE quantile transformation happens in four steps, two of which are transformations:

        1. First KDE PDFs and CDFs are formed for all marginalized input variables.
        2. Using the (smooth) CDFs, all input variables are transformed to uniform distributions.
        3. Using the existing quantile transformer of sklearn, these uniform distributions are then transformed to
           normal distributions.
        4. The KDE PDFs are used to calculate the (inverse) jacobian of the transformation.

        Concerning KDE evaluation of the PDF and CDF, the adaptive bandwidths are evaluated with the eqns described in:
        Cranmer KS, Kernel Estimation in High-Energy Physics. Computer Physics Communications 136:198-207, 2001
        e-Print Archive: hep ex/0011057

        In theory both transformations could be combined into one, but there are practical advantages to using two.
        Essentially the second transformation is a backup for the first one, to smooth out residual bumps.
        For certain edge-case distributions, for example those with strange discrete peaks at the edge
        of a distribution, a single transformation may fail, in which case doing two quantile
        transformations catches any potential imperfections in the first.
        In the inverse transformation, however, the two transformations are by default combined into one, because
        otherwise the impact of the KDE smoothing would be cancelled.

        :param int n_quantiles: number of quantiles/bins used in output histogram. If greater than number of samples,
            this is reset to number of samples. Default is 1000.
        :param str output_distribution: 'uniform' or 'normal' distribution.
        :param bool smooth_peaks: if False, do not smear peaks of non-unique values.
        :param mirror_left: array. Mirror the data on a value on the left to counter signal leakage.
            Default is None, which is no mirroring.
        :param mirror_right: array. Mirror the data on a value on the right to counter signal leakage.
            Default is None, which is no mirroring.
        :param float rho: KDE bandwidth scale parameter. default is 0.5.
        :param int n_adaptive: KDE number of adaptive iterations to be applied to improve the band width. default is 1.
        :param x_min: array. minimum value of pdf's x range. default is None (= - inf)
        :param x_max: array. maximum value of pdf's x range. default is None (= + inf)
        :param int n_integral_bins: for internal evaluation, number of integration bins beyond x-range. default is 1000.
        :param bool use_KDE: Default is True. If false, KDE smoothing is off, using default quantile transformation.
        :param bool use_inverse_qt: Default is False. If true, KDE is not used in inverse transformation.
        :param int random_state: when an integer, the seed given random generator.
        :param copy: Copy the data before transforming. Default is True.
        """
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.smooth_peaks = smooth_peaks
        self.n_adaptive = n_adaptive
        self.copy = copy
        self.use_inverse_qt = use_inverse_qt
        self.use_KDE = use_KDE
        self.n_integral_bins = max(n_integral_bins, 1000)
        self.random_state = random_state

        # integration range
        self.x_min = np.array(x_min) if isinstance(x_min,
                                                   (list, tuple,
                                                    np.ndarray)) else None
        self.x_max = np.array(x_max) if isinstance(x_max,
                                                   (list, tuple,
                                                    np.ndarray)) else None

        # left and right-hand mirror points
        self.mirror_left = np.array(mirror_left) if isinstance(
            mirror_left, (list, tuple, np.ndarray)) else None
        self.mirror_right = np.array(mirror_right) if isinstance(
            mirror_right, (list, tuple, np.ndarray)) else None

        # copy x ranges if mirror points not set
        self.mirror_left = self.x_min if self.mirror_left is None else self.mirror_left
        self.mirror_right = self.x_max if self.mirror_right is None else self.mirror_right

        # bandwidth rescaling factor
        self.rho = np.array(rho) if isinstance(rho, (list, tuple,
                                                     np.ndarray)) else rho

        # basic checks on attributes
        if self.n_quantiles <= 0:
            raise ValueError(
                "Invalid value for 'n_quantiles': %d. The number of quantiles must be at least one."
                % self.n_quantiles)
        if self.output_distribution not in ('normal', 'uniform'):
            raise ValueError(
                "'output_distribution' has to be either 'normal' or 'uniform'. Got '%s' instead."
                % self.output_distribution)
        if (isinstance(self.rho, np.ndarray) and any([r <= 0 for r in self.rho])) or \
                (isinstance(self.rho, (float, np.number)) and self.rho <= 0):
            raise ValueError(
                "Invalid value(s) for 'rho': %f. The number(s) must be greater than zero."
                % self.rho)
        if self.n_adaptive < 0:
            raise ValueError(
                "Invalid value for 'n_adaptive': %d. Must be positive." %
                self.n_adaptive)

    def fit(self, X, y=None):
        """Compute the kde-based quantiles used for transforming.

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :param y: Ignored
        :return: self : object
        """
        X = check_array(X,
                        copy=False,
                        dtype=FLOAT_DTYPES,
                        force_all_finite="allow-nan")

        # sample profiles
        n_samples, n_features = X.shape

        # continuation of basic checks, now that we know X
        if isinstance(self.rho, np.ndarray):
            if self.rho.shape[0] != n_features:
                raise ValueError(
                    "Invalid size of 'rho': %d. The number should match the data: %d."
                    % (self.rho.shape[0], n_features))
        else:
            self.rho = np.array([self.rho] * n_features)
        if isinstance(self.mirror_left, np.ndarray):
            if self.mirror_left.shape[0] != n_features:
                raise ValueError(
                    "Invalid size of 'mirror_left': %d. The number should match the data: %d."
                    % (self.mirror_left.shape[0], n_features))
        else:
            self.mirror_left = np.array([None] * n_features)
        if isinstance(self.mirror_right, np.ndarray):
            if self.mirror_right.shape[0] != n_features:
                raise ValueError(
                    "Invalid size of 'mirror_right': %d. The number should match the data: %d."
                    % (self.mirror_right.shape[0], n_features))
        else:
            self.mirror_right = np.array([None] * n_features)
        if isinstance(self.x_min, np.ndarray):
            if self.x_min.shape[0] != n_features:
                raise ValueError(
                    "Invalid size of 'x_min': %d. The number should match the data: %d."
                    % (self.x_min.shape[0], n_features))
        else:
            self.x_min = np.array([None] * n_features)
        if isinstance(self.x_max, np.ndarray):
            if self.x_max.shape[0] != n_features:
                raise ValueError(
                    "Invalid size of 'x_max': %d. The number should match the data: %d."
                    % (self.x_max.shape[0], n_features))
        else:
            self.x_max = np.array([None] * n_features)

        # number of quantiles cannot be higher than number of data points. If so, reset.
        if self.n_quantiles > n_samples:
            warnings.warn("n_quantiles (%s) is greater than the total number "
                          "of samples (%s). n_quantiles is set to "
                          "n_samples." % (self.n_quantiles, n_samples))
        self.n_quantiles = max(1, min(self.n_quantiles, n_samples))

        # set the (x_min, x_max) transformation range
        # if not set, by default widen the range beyond min/max to account for signal leakage
        if any([x is None
                for x in self.x_min]) or any([x is None for x in self.x_max]):
            gstd = np.std(X, axis=0)
            bw = np.power(4 / 3, 0.2) * gstd * np.power(n_samples, -0.2)
            min_orig = np.min(X, axis=0) - 10 * bw
            max_orig = np.max(X, axis=0) + 10 * bw
            for i in range(n_features):
                self.x_min[i] = min_orig[i] if (
                    self.x_min[i] is None and gstd[i] > 0) else self.x_min[i]
                self.x_max[i] = max_orig[i] if (
                    self.x_max[i] is None and gstd[i] > 0) else self.x_max[i]

        if self.use_KDE:
            # Do the actual KDE fit (to uniform distributions)
            self._kde_fit(X)
            # prepare X to do quantile transformer fit.
            # add extreme points so QT knows the true edges for inverse transformation after sampling
            X = self._kde_transform(X)
            low = np.array([[0] * X.shape[1]])
            high = np.array([[1] * X.shape[1]])
            X = np.concatenate([X, low, high], axis=0)
        elif self.smooth_peaks:
            X = self._smooth_peaks(X)
            # create pdf for quantile transformation
            self._qt_pdf(X)

        # perform quantile transformation to smooth out any residual imperfections after kde
        # standard quantile transformer helps to smooth out any residual imperfections after kde transformation,
        # and does conversion to normal.
        self.qt_ = QuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            copy=self.copy)
        self.qt_.fit(X)

        return self

    def _qt_pdf(self, X, min_pdf_value=1e-20):
        """Internal function to make quantile transformer pdf

        Is only run when use_KDE=False

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        """
        self.pdf_ = []

        n_samples, n_features = X.shape
        ps = np.linspace(0, 1, self.n_quantiles + 1)

        # calculate quantiles and pdf
        for i in range(n_features):
            x = X[:, i]
            qs = np.quantile(x, ps)
            bin_entries, bin_edges = np.histogram(x, bins=qs)
            bin_diffs = np.diff(bin_edges)
            pdf_norm = bin_entries / n_samples / bin_diffs
            fast_pdf = interpolate.interp1d(bin_edges[:-1],
                                            pdf_norm,
                                            kind='previous',
                                            bounds_error=False,
                                            fill_value=(min_pdf_value,
                                                        min_pdf_value))
            self.pdf_.append({'fast': fast_pdf})

    def _kde_fit(self, X):
        """Internal function to compute the kde-based quantiles used for transforming.

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: self : object
        """
        # reset
        self.pdf_ = []
        self.cdf_ = []

        n_features = X.shape[1]

        for i in range(n_features):
            # do kde fit, store each pdf
            bin_entries, bin_mean = kde_process_data(
                X[:, i],
                self.n_quantiles,
                self.smooth_peaks,
                self.mirror_left[i],
                self.mirror_right[i],
                random_state=self.random_state)
            band_width = kde_bw(bin_mean, bin_entries, self.rho[i],
                                self.n_adaptive)
            # transformers to uniform distribution and back
            fast_pdf, F, Finv, kde_norm = kde_make_transformers(
                bin_mean,
                bin_entries,
                band_width,
                x_min=self.x_min[i],
                x_max=self.x_max[i],
                n_bins=self.n_integral_bins)
            # store cdf, inverse-cdf, and pdf.
            self.cdf_.append((F, Finv))
            pdf = {
                'bin_entries': bin_entries,
                'bin_mean': bin_mean,
                'band_width': band_width,
                'norm': kde_norm,
                'fast': fast_pdf
            }
            self.pdf_.append(pdf)

        return self

    def _smooth_peaks(self, X):
        """Internal function to smooth non-unique peaks

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        X = check_array(X,
                        copy=self.copy,
                        dtype=FLOAT_DTYPES,
                        force_all_finite="allow-nan")

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # smooth peaks - note: this adds a random component to the data
            # applying smoothing to data that's already been smoothed has no impact, b/c all peaks are already gone.
            x = kde_smooth_peaks_1dim(x,
                                      self.mirror_left[feature_idx],
                                      self.mirror_right[feature_idx],
                                      copy=False,
                                      random_state=self.random_state)
            X[:, feature_idx] = x
        return X

    def _kde_transform(self, X):
        """Internal function to transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        X = check_array(X,
                        copy=self.copy,
                        dtype=FLOAT_DTYPES,
                        force_all_finite="allow-nan")

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # smooth peaks - note: this adds a random component to the data
            # applying smoothing to data that's already been smoothed has no impact, b/c all peaks are already gone.
            if self.smooth_peaks:
                x = kde_smooth_peaks_1dim(x,
                                          self.mirror_left[feature_idx],
                                          self.mirror_right[feature_idx],
                                          copy=False,
                                          random_state=self.random_state)
            # transform distribution to uniform
            y = self.cdf_[feature_idx][0](x)
            # transform uniform [0,1] distribution to normal
            # X[:, feature_idx] = np.sqrt(2.) * erfinv(2. * y - 1.) if self.output_distribution == 'normal' else y
            X[:, feature_idx] = y

        return X

    def transform(self, X):
        """Transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        # 1. kde transformation to uniform.
        if self.use_KDE:
            X = self._kde_transform(X)
        elif self.smooth_peaks:
            X = self._smooth_peaks(X)

        # 2. quantile transformation to smooth out residual bumps and do conversion to normal distribution
        return self.qt_.transform(X)

    def _kde_inverse_transform(self, X):
        """Internal function to inverse transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The inverse-transformed data
        """
        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # transform normal back to uniform [0,1]
            if not self.use_inverse_qt:
                x = (0.5 + 0.5 * erf(x / np.sqrt(2.))
                     ) if self.output_distribution == 'normal' else x
            # transform uniform back to original distribution
            X[:, feature_idx] = self.cdf_[feature_idx][1](x)

        return X

    def inverse_transform(self, X):
        """Inverse transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The inverse-transformed data
        """
        # 1. quantile transformation back to kde
        if self.use_inverse_qt or not self.use_KDE:
            X = self.qt_.inverse_transform(X)
        # 2. inverse kde transformation
        return self._kde_inverse_transform(X) if self.use_KDE else X

    def jacobian(self, X):
        """Provide the Jacobian of the transformation

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, )
            An array with the jacobian of each data point
        """
        X = check_array(X,
                        copy=self.copy,
                        dtype=FLOAT_DTYPES,
                        force_all_finite="allow-nan")

        # smoothing of peaks
        if self.smooth_peaks:
            X = self._smooth_peaks(X)

        jac = 1.0
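        # accumulate 1 / pdf_i(x_i) across the features; for normal output this is then
        # multiplied by the standard normal pdf evaluated at the transformed values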

        for idx in range(X.shape[1]):
            kdfi = self.pdf_[idx]['fast']
            jac /= kdfi(X[:, idx])

        if self.output_distribution == 'normal':
            X = self.transform(X)
            for idx in range(X.shape[1]):
                jac *= norm.pdf(X[:, idx])

        return jac

    def inverse_jacobian(self, X):
        """Provide the Jacobian of the inverse transformation

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, )
            An array with the jacobian of the inverse transformation of each input data point
        """
        X = check_array(X,
                        copy=self.copy,
                        dtype=FLOAT_DTYPES,
                        force_all_finite="allow-nan")

        inv_jac = 1.0

        if self.output_distribution == 'normal':
            for idx in range(X.shape[1]):
                inv_jac /= norm.pdf(X[:, idx])

        X = self.inverse_transform(X)

        for idx in range(X.shape[1]):
            kdfi = self.pdf_[idx]['fast']
            inv_jac *= kdfi(X[:, idx])

        return inv_jac
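The transformer above follows the standard scikit-learn fit / transform / inverse_transform protocol. A minimal usage sketch is shown below; the class name KDEQuantileTransformer and its constructor call are assumptions (the class definition and __init__ sit above this excerpt), while the method names come from the code itself.

import numpy as np

# hypothetical usage of the KDE-based quantile transformer implemented above
X = np.random.exponential(scale=2.0, size=(5000, 3))

kqt = KDEQuantileTransformer(output_distribution='normal')  # assumed class name / arguments
kqt.fit(X)
Z = kqt.transform(X)                # kde -> uniform, then quantile transform -> normal
X_back = kqt.inverse_transform(Z)   # normal -> uniform -> original scale
jac = kqt.jacobian(X)               # per-sample Jacobian of the forward map

# the round trip is approximate because of peak smoothing and the finite quantile grid
print(np.abs(X - X_back).max())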
Code example #15
0
def scaler_uniform():
    scaler_ = QuantileTransformer()
    X = np.random.uniform(20, 30, (1000, 10))
    scaler_.fit(X)
    return scaler_
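The scaler_* helpers in this and the following examples read like test fixtures that return pre-fitted transformers. A short sketch of how such a fixture might be consumed (the new data below is made up):

import numpy as np

scaler = scaler_uniform()
X_new = np.random.uniform(20, 30, (5, 10))
# with the default output_distribution='uniform', the transformed values land in [0, 1]
X_t = scaler.transform(X_new)
print(X_t.min(), X_t.max())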
Code example #16
0
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import RepeatedKFold, GridSearchCV

train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")



y_train = train_df["target"].values
X_train = train_df.iloc[:, 2:].values
train_id = train_df["id"].values
X_test = test_df.iloc[:, 1:].values
test_id = test_df["id"].values



quantile_transformer = QuantileTransformer(n_quantiles=1000, output_distribution="normal", ignore_implicit_zeros=False, subsample=100000, random_state=42, copy=True)
quantile_transformer.fit(np.vstack([X_train, X_test]))
X_quantile_train = quantile_transformer.transform(X_train)
X_quantile_test = quantile_transformer.transform(X_test)



ridge_classifier = RidgeClassifier(copy_X=True, max_iter=None, tol=0.001, random_state=42)
repeated_kfold = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)
# note: Ridge's `normalize` option and GridSearchCV's `iid` option do not exist in
# current scikit-learn releases, so they are not used here
search_grid = {
    "alpha": np.geomspace(1e-3, 1e3, 50),
    "fit_intercept": [True, False],
    "class_weight": [None, "balanced"],
    "solver": ["svd", "cholesky", "sparse_cg", "lsqr"]
}
grid_search = GridSearchCV(ridge_classifier, search_grid, scoring="roc_auc", n_jobs=1,
                           refit=True, cv=repeated_kfold, verbose=True,
                           error_score=0.0, return_train_score=False)
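The snippet stops after building the search object. Below is a hedged continuation showing how the search would typically be run on the quantile-transformed features; the submission file name and column layout are assumptions, not part of the original project:

grid_search.fit(X_quantile_train, y_train)
print(grid_search.best_params_, grid_search.best_score_)

# refit=True exposes the best estimator; RidgeClassifier scores via decision_function
best_model = grid_search.best_estimator_
test_scores = best_model.decision_function(X_quantile_test)

# assumed submission format: id column plus the predicted score
submission = pd.DataFrame({"id": test_id, "target": test_scores})
submission.to_csv("submission.csv", index=False)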
Code example #17
0
def scaler_normal():
    scaler_ = QuantileTransformer(output_distribution='normal',
                                  n_quantiles=100)
    X = np.random.uniform(20, 30, (1000, 10))
    scaler_.fit(X)
    return scaler_
Code example #18
0
File: clustering.py Project: whuss/dashboard_react
def input_data_clustering(device: str,
                          start_date: date,
                          end_date: Optional[date] = None,
                          n_clusters=5,
                          return_only_cluster=True,
                          return_pca=False) -> pd.DataFrame:
    def add_column_postfix(df: pd.DataFrame, postfix: str) -> pd.DataFrame:
        columns = df.columns
        mapping = {c: f"{c}_{postfix}" for c in columns}
        return df.rename(columns=mapping)

    # get normalized input data
    data = get_input_data(device,
                          start_date,
                          end_date=end_date,
                          normalized=True)
    if data.empty:
        return data

    # compute statistics over rolling window
    rolling = data.rolling('15Min', min_periods=1, win_type=None)
    data_rolling_ = list()
    data_rolling_.append(add_column_postfix(rolling.count(), "count"))
    data_rolling_.append(add_column_postfix(rolling.sum(), "sum"))
    data_rolling_.append(add_column_postfix(rolling.mean(), "mean"))
    data_rolling_.append(add_column_postfix(rolling.median(), "median"))
    data_rolling_.append(add_column_postfix(rolling.var(), "var"))
    data_rolling_.append(add_column_postfix(rolling.kurt(), "kurt"))
    data_rolling_.append(add_column_postfix(rolling.skew(), "skew"))
    data_rolling = pd.concat(data_rolling_, axis=1)
    data_rolling = data_rolling.loc[~data_rolling.index.duplicated(
        keep='first')]
    data_rolling = data_rolling.resample("1Min").nearest(limit=1).dropna(
        how='all')

    from analytics.instruction import get_power
    power_data = get_power(device, start_date)
    power_data_rolling = power_data.rolling('15Min',
                                            min_periods=1,
                                            win_type=None).mean()
    data_rolling = data_rolling.merge(power_data_rolling,
                                      how='left',
                                      left_index=True,
                                      right_index=True)
    data_rolling = data_rolling[data_rolling.power >= 0.95]
    data_rolling = data_rolling.drop(columns='power')

    # normalize rolling data
    st_rolling = QuantileTransformer(output_distribution="normal")
    st_rolling.fit(data_rolling)
    data_rolling_normalized = pd.DataFrame(st_rolling.transform(data_rolling),
                                           columns=data_rolling.columns,
                                           index=data_rolling.index).fillna(0)

    # not enough data to form the requested number of clusters
    if len(data_rolling_normalized) < n_clusters:
        return pd.DataFrame()

    # perform PCA
    pca = PCA(random_state=31415)
    pca.fit(data_rolling_normalized)

    variance = np.cumsum(pca.explained_variance_ratio_)

    # number of components needed to keep cumulative explained variance above 0.95
    n_dims = variance[variance <= 0.95].shape[0] + 1

    data_pca = pca.transform(data_rolling_normalized)[:, :n_dims]

    # cluster the data into n_clusters clusters with k-means
    k_means = KMeans(n_clusters=n_clusters, random_state=31415)
    clustering = k_means.fit_predict(data_pca)

    if return_pca:
        cluster_df = pd.DataFrame(clustering, columns=['cluster'])
        pca_df = pd.DataFrame(data_pca)
        pca_df.columns = [f"d_{c}" for c in pca_df.columns]
        return pd.concat([cluster_df, pca_df], axis=1)

    data_rolling.loc[:, 'cluster'] = clustering
    if return_only_cluster:
        return data_rolling[['cluster']]
    return data_rolling
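A hedged usage sketch for input_data_clustering; the device id and date range below are placeholders, not values from the project:

from datetime import date

clusters = input_data_clustering("device-0001",            # hypothetical device id
                                 date(2020, 6, 1),
                                 end_date=date(2020, 6, 30),
                                 n_clusters=5,
                                 return_only_cluster=True)
if not clusters.empty:
    # one cluster label per (resampled) timestamp
    print(clusters['cluster'].value_counts())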
Code example #19
0
def scaler_bool_normal():
    scaler_ = QuantileTransformer(output_distribution='normal')
    # two-valued feature with an 80%/20% mix; np.random.choice takes the weights
    # via the `p` keyword and the shape via `size`
    X = np.random.choice([22., 27.], size=(1000, 10), p=[0.8, 0.2])
    scaler_.fit(X)
    return scaler_
Code example #20
0
axes = axes.flatten()
axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21),
             (13, 16, 19, 22), (14, 17, 20, 23)]
axes_list = [(axes[i], axes[j], axes[k], axes[l])
             for (i, j, k, l) in axes_idxs]

for distribution, color, axes in zip(distributions, colors, axes_list):
    name, X = distribution
    X_train, X_test = train_test_split(X, test_size=.5)

    # perform power transforms and quantile transform
    X_trans_bc = bc.fit(X_train).transform(X_test)
    lmbda_bc = round(bc.lambdas_[0], 2)
    X_trans_yj = yj.fit(X_train).transform(X_test)
    lmbda_yj = round(yj.lambdas_[0], 2)
    X_trans_qt = qt.fit(X_train).transform(X_test)

    ax_original, ax_bc, ax_yj, ax_qt = axes

    ax_original.hist(X_train, color=color, bins=BINS)
    ax_original.set_title(name, fontsize=FONT_SIZE)
    ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

    for ax, X_trans, meth_name, lmbda in zip(
        (ax_bc, ax_yj, ax_qt), (X_trans_bc, X_trans_yj, X_trans_qt),
        ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
        (lmbda_bc, lmbda_yj, None)):
        ax.hist(X_trans, color=color, bins=BINS)
        title = 'After {}'.format(meth_name)
        if lmbda is not None:
            # keep the newline outside the raw string so it renders as a line break
            title += '\n' + r'$\lambda$ = {}'.format(lmbda)
        ax.set_title(title, fontsize=FONT_SIZE)
Code example #21
0
def scaler_delta_normal():
    scaler_ = QuantileTransformer(output_distribution='normal')
    X = np.full((10000, 10), np.pi)
    scaler_.fit(X)
    return scaler_
Code example #22
0
axes_idxs = [(0, 3, 6, 9), (1, 4, 7, 10), (2, 5, 8, 11), (12, 15, 18, 21),
             (13, 16, 19, 22), (14, 17, 20, 23)]
axes_list = [(axes[i], axes[j], axes[k], axes[l])
             for (i, j, k, l) in axes_idxs]


for distribution, color, axes in zip(distributions, colors, axes_list):
    name, X = distribution
    X_train, X_test = train_test_split(X, test_size=.5)

    # perform power transforms and quantile transform
    X_trans_bc = bc.fit(X_train).transform(X_test)
    lmbda_bc = round(bc.lambdas_[0], 2)
    X_trans_yj = yj.fit(X_train).transform(X_test)
    lmbda_yj = round(yj.lambdas_[0], 2)
    X_trans_qt = qt.fit(X_train).transform(X_test)

    ax_original, ax_bc, ax_yj, ax_qt = axes

    ax_original.hist(X_train, color=color, bins=BINS)
    ax_original.set_title(name, fontsize=FONT_SIZE)
    ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

    for ax, X_trans, meth_name, lmbda in zip(
            (ax_bc, ax_yj, ax_qt),
            (X_trans_bc, X_trans_yj, X_trans_qt),
            ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
            (lmbda_bc, lmbda_yj, None)):
        ax.hist(X_trans, color=color, bins=BINS)
        title = 'After {}'.format(meth_name)
        if lmbda is not None:
            # keep the newline outside the raw string so it renders as a line break
            title += '\n' + r'$\lambda$ = {}'.format(lmbda)
        ax.set_title(title, fontsize=FONT_SIZE)
Code example #23
0
File: keras-mlp.py Project: hsed/ml-cw
total_records = np_data.shape[0]

#classes
#labels = data.ix[:,-1].values.astype('int32')

train_rec = int(0.7*total_records)  # approx 70%
#test_rec = total_records - train_rec

#X_all = np_data[:, :-1]
#y_all = np_data[:, -1].astype(int)
scaler = QuantileTransformer()

X_train = np_data[:train_rec,:-1]
y_train = np_data[:train_rec,-1].astype(int)
scaler.fit(X_train)

X_test = np_data[train_rec:,:-1]
y_test = np_data[train_rec:,-1].astype(int)
# apply the scaler fitted on the training split to both splits
# (transform returns a new array, so the result must be assigned back)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#print("x_train: \n", X_train, "\n\n y_train: ", y_train)

#raise SystemExit
#print("Labels: ", y_one_hot_train)
# convert list of labels to binary class matrix
#y_train = np_utils.to_categorical(labels)


#input_dim = X_train.shape[1]
#nb_classes = y_train.shape[1]
# define 10-fold cross validation test harness
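The excerpt ends at the cross-validation comment. What follows is only a sketch of what such a harness could look like with scikit-learn's StratifiedKFold; build_model() is a stand-in for the project's (unseen) Keras MLP constructor, and everything below is an assumption rather than the repository's actual code:

from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
cv_scores = []
for train_idx, val_idx in kfold.split(X_train, y_train):
    model = build_model()  # hypothetical helper returning a compiled Keras MLP
    model.fit(X_train[train_idx], y_train[train_idx],
              epochs=50, batch_size=32, verbose=0)
    # assumes the model was compiled with an accuracy metric
    _, acc = model.evaluate(X_train[val_idx], y_train[val_idx], verbose=0)
    cv_scores.append(acc)
print("cv accuracy: %.3f +/- %.3f" % (np.mean(cv_scores), np.std(cv_scores)))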