Code Example #1
# imports needed by this snippet
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# load the dataset and prepare the input and output elements
def load_dataset(full_path):
	# load the dataset as a numpy array
	data = read_csv(full_path, header=None)
	# retrieve numpy array
	data = data.values
	# split into input and output elements
	X, y = data[:, :-1], data[:, -1]
	# label encode the target variable to have the classes 0 and 1
	y = LabelEncoder().fit_transform(y)
	return X, y

# define the location of the dataset
full_path = 'haberman.csv'
# load the dataset
X, y = load_dataset(full_path)
# fit the model
steps = [('t1', MinMaxScaler()), ('t2', PowerTransformer()), ('m', LogisticRegression(solver='lbfgs'))]
model = Pipeline(steps=steps)
model.fit(X, y)
# some survival cases
print('Survival Cases:')
data = [[31,59,2], [31,65,4], [34,60,1]]
for row in data:
	# make prediction
	yhat = model.predict_proba([row])
	# get percentage of survival
	p_survive = yhat[0, 0] * 100
	# summarize
	print('>data=%s, Survival=%.3f%%' % (row, p_survive))
# some non-survival cases
print('Non-Survival Cases:')
data = [[44,64,6], [34,66,9], [38,69,21]]
for row in data:
	# make prediction
	yhat = model.predict_proba([row])
	# get percentage of survival
	p_survive = yhat[0, 0] * 100
	# summarize
	print('>data=%s, Survival=%.3f%%' % (row, p_survive))
Code Example #2
# imports inferred from the usage in this snippet
import pickle as pkl
from copy import copy

import numpy as np
import torch
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler


def get_data(ssX=None, batch_size=32, train=True, **kwargs):
    """
    inputs:
        batch_size: int

    return:
        (dataloader, test_dataloader)
    """
    plot_random = kwargs.get('plot_random', False)
    plot_resonant = not plot_random
    train_all = kwargs.get('train_all', False)
    plot = kwargs.get('plot', False)
    if not train_all and ssX is None:
        plot_resonant = True
        plot_random = False

    if train_all:
        filename = 'data/combined.pkl'
    elif plot_resonant:
        filename = 'data/resonant_dataset.pkl'
    elif plot_random:
        filename = 'data/random_dataset.pkl'

    # These are generated by data_from_pkl.py
    loaded_data = pkl.load(open(filename, 'rb'))

    train_ssX = (ssX is None)

    fullX, fully = loaded_data['X'], loaded_data['y']

    if train_all:
        len_random = 17082  #Number of valid random examples (others have NaNs)
        random_data = np.arange(len(fullX)) >= (len(fullX) - len_random)

    # Differentiate megno
    if 'fix_megno' in kwargs and kwargs['fix_megno']:
        idx = [
            i for i, lab in enumerate(loaded_data['labels']) if 'megno' in lab
        ][0]
        fullX[:, 1:, idx] -= fullX[:, :-1, idx]

    if 'include_derivatives' in kwargs and kwargs['include_derivatives']:
        derivative = fullX[:, 1:, :] - fullX[:, :-1, :]
        derivative = np.concatenate((derivative[:, [0], :], derivative),
                                    axis=1)
        fullX = np.concatenate((fullX, derivative), axis=2)

    # Hide fraction of test
    # MAKE SURE WE DO COPIES AFTER!!!!
    if train:
        if train_all:
            remy, finaly, remX, finalX, rem_random, final_random = train_test_split(
                fully,
                fullX,
                random_data,
                shuffle=True,
                test_size=1. / 10,
                random_state=0)
            trainy, testy, trainX, testX, train_random, test_random = train_test_split(
                remy,
                remX,
                rem_random,
                shuffle=True,
                test_size=1. / 10,
                random_state=1)
        else:
            remy, finaly, remX, finalX = train_test_split(fully,
                                                          fullX,
                                                          shuffle=True,
                                                          test_size=1. / 10,
                                                          random_state=0)
            trainy, testy, trainX, testX = train_test_split(remy,
                                                            remX,
                                                            shuffle=True,
                                                            test_size=1. / 10,
                                                            random_state=1)
    else:
        assert not train_all
        remy = fully
        finaly = fully
        testy = fully
        trainy = fully
        remX = fullX
        finalX = fullX
        testX = fullX
        trainX = fullX

    if plot:
        # Use test dataset for plotting, so put it in validation part:
        testX = finalX
        testy = finaly

    if train_ssX:
        if 'power_transform' in kwargs and kwargs['power_transform']:
            ssX = PowerTransformer(method='yeo-johnson')  # power transform works best
        else:
            ssX = StandardScaler()

    n_t = trainX.shape[1]
    n_features = trainX.shape[2]

    if train_ssX:
        ssX.fit(trainX.reshape(-1, n_features)[::1539])

    ttrainy = trainy
    ttesty = testy
    ttrainX = ssX.transform(trainX.reshape(-1, n_features)).reshape(
        -1, n_t, n_features)
    ttestX = ssX.transform(testX.reshape(-1, n_features)).reshape(
        -1, n_t, n_features)
    if train_all:
        ttest_random = test_random
        ttrain_random = train_random

    tremX = ssX.transform(remX.reshape(-1, n_features)).reshape(
        -1, n_t, n_features)
    tremy = remy

    train_len = ttrainX.shape[0]
    X = Variable(
        torch.from_numpy(np.concatenate(
            (ttrainX, ttestX))).type(torch.FloatTensor))
    y = Variable(
        torch.from_numpy(np.concatenate(
            (ttrainy, ttesty))).type(torch.FloatTensor))
    if train_all:
        r = Variable(
            torch.from_numpy(np.concatenate(
                (ttrain_random, ttest_random))).type(torch.BoolTensor))

    Xrem = Variable(torch.from_numpy(tremX).type(torch.FloatTensor))
    yrem = Variable(torch.from_numpy(tremy).type(torch.FloatTensor))

    idxes = np.s_[:]
    dataset = torch.utils.data.TensorDataset(X[:train_len, :, idxes],
                                             y[:train_len])
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             pin_memory=True,
                                             num_workers=8)

    # Cut up the dataset into only the random or resonant parts.
    # Only needed when plotting with the combined (train_all) dataset.
    if (not plot) or (not train_all):
        test_dataset = torch.utils.data.TensorDataset(X[train_len:, :, idxes],
                                                      y[train_len:])
    else:
        mask = r if plot_random else ~r
        print(
            f'Plotting with {mask.sum()} total elements, when plot_random={plot_random}'
        )
        # index with the selected mask (random or resonant part)
        test_dataset = torch.utils.data.TensorDataset(
            X[train_len:][mask[train_len:]][:, :, idxes],
            y[train_len:][mask[train_len:]])

    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=3000,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  num_workers=8)

    kwargs['model'].ssX = copy(ssX)

    return dataloader, test_dataloader
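A hypothetical call to get_data, assuming the pickle files referenced above exist and that the object passed as model exposes an ssX attribute (my_model is a made-up placeholder):

dataloader, test_dataloader = get_data(batch_size=64,
                                       train=True,
                                       model=my_model,  # placeholder object with an .ssX slot
                                       power_transform=True)
for batch_X, batch_y in dataloader:
    print(batch_X.shape)  # (batch, n_t, n_features)
    break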
Code Example #3
# Create a 1D array of zeros, the same length as yorig, named target
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros.html
target = np.zeros((len(yorig), 1)).ravel()

# Set target to 1 at the indices where the positive mask was marked True earlier
target[positive] = 1
total_aggressive = np.sum(positive)
# This is not strictly needed since target starts as zeros, but it is kept for completeness: it sets target
# to 0 at the indices where the negative mask was marked True earlier
target[negative] = 0
total_even = np.sum(negative)

# look at scaling
if scaler == "power":
    print("Power Transform Scaler")
    X_scaler = PowerTransformer(method='yeo-johnson')
    Xorig = scaleData(Xorig, X_scaler)
elif scaler == "norm":
    print("Normalizer Scaler")
    X_scaler = Normalizer()
    Xorig = scaleData(Xorig, X_scaler)
elif scaler == "robo":
    print("Robust Scaler")
    X_scaler = RobustScaler(copy=True,
                            quantile_range=(25.0, 75.0),
                            with_centering=True,
                            with_scaling=True)
    Xorig = scaleData(Xorig, X_scaler)
elif scaler == "standard":
    print("Standard Scaler")
    X_scaler = StandardScaler(with_mean=True, with_std=True)
Code Example #4
# Take only 2 features to make visualization easier
# Feature 0 has a long-tailed distribution.
# Feature 5 has a few, but very large, outliers.

X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling', StandardScaler().fit_transform(X)),
    ('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
     RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after power transformation (Yeo-Johnson)',
     PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Data after power transformation (Box-Cox)',
     PowerTransformer(method='box-cox').fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
     QuantileTransformer(output_distribution='uniform').fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
     QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)),
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)

# plasma does not exist in matplotlib < 1.5
cmap = getattr(cm, 'plasma_r', cm.hot_r)
Code Example #5
File: test_common.py Project: AnAnteup/icp5
iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive",
    [(MaxAbsScaler(), maxabs_scale, True, False),
     (MinMaxScaler(), minmax_scale, False, False),
     (StandardScaler(), scale, False, False),
     (StandardScaler(with_mean=False), scale, True, False),
     (PowerTransformer('yeo-johnson'), power_transform, False, False),
     (PowerTransformer('box-cox'), power_transform, False, True),
     (QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
     (RobustScaler(), robust_scale, False, False),
     (RobustScaler(with_centering=False), robust_scale, True, False)])
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
    # check that the preprocessing method lets NaN values pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    if strictly_positive:
        X += np.nanmin(X) + 0.1
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
Code Example #6
File: fundus.py Project: bilha-analytics/lab_works_2
    ]
    base_data_pipe_post = [
        ('flatten', preprocess.Flattenor()),
    ]

    print(f"\n{c*10} Starting TrainingManager with Grid Search {c*10}\n")
    dpipez = [
        Pipeline(base_data_pipe_pre + [
            ('basic_green', extract.ColorChannelz()),
        ] + base_data_pipe_post + [
            ('scaler', StandardScaler()),
        ]),
        Pipeline(base_data_pipe_pre + [
            ('basic_green', extract.ColorChannelz()),
        ] + base_data_pipe_post + [
            ('power', PowerTransformer()),
        ]),
        ## TODO: recheck size remaps
        # Pipeline( base_data_pipe_pre+[('color_chan', extract.FundusColorChannelz() ),]+base_data_pipe_post +[('scaler', StandardScaler()), ] ),
        # Pipeline( base_data_pipe_pre+[('eigenz_chan', extract.EigenzChannelz(topn=70) ),]+base_data_pipe_post +[('scaler', StandardScaler()), ] ),
        # Pipeline( base_data_pipe_pre+[('patch_chan', extract.PatchifyChannelz(nx_patchez=12) ),]+base_data_pipe_post +[('scaler', StandardScaler()), ] )
    ]

    mpipez = [
        (Pipeline([('flatten', preprocess.Flattenor()), ('svm', svm.SVC())]), {
            'kernel': ('linear', 'rbf'),
            'C': [1, 10]
        }),  ## 
        (Pipeline([('flatten', preprocess.Flattenor()),
                   ('logit', LogisticRegression())]), {
                       'C': [1, 10]
Code Example #7
File: pca.py Project: abhijeetdtu/queencity20
from queencity20.utils.getData import *
from queencity20.utils.remove_correlated import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from collections import defaultdict
import numpy as np
df = getTrainingData()
df.head()

allButTarget = [c for c in df.columns if c != "target"]
means = df.mean(skipna=True)
fdf = df.fillna(means)

from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
fdf.loc[:,allButTarget] = pt.fit_transform(fdf.loc[:,allButTarget])

fdf = diffCols(fdf)

#corcols = list(set(find_correlation(fdf.drop("target" , axis=1), threshold=0.9)))
#fdf = fdf.drop(corcols , axis=1)

#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X = scaler.fit_transform(X)

X = fdf.drop(["target"], axis=1)
fdf = fdf[~np.any(np.logical_or(X > 2.5 , X < -2.5) , axis=1)]
X = fdf.drop(["target"], axis=1)
y = fdf["target"]
Code Example #8
from sklearn.preprocessing import PowerTransformer


def power_transform(data):
    # Yeo-Johnson transform (the PowerTransformer default), without standardizing the output
    pt = PowerTransformer(standardize=False)
    return pt.fit_transform(data)
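A minimal usage sketch for the helper above, on made-up right-skewed data:

import numpy as np

rng = np.random.RandomState(0)
skewed = rng.lognormal(size=(100, 2))  # strongly right-skewed columns
transformed = power_transform(skewed)  # roughly symmetric after Yeo-Johnson
print(transformed.shape)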
Code Example #9
File: data_cleaning_flow.py Project: crawftv/crawto
import pandas as pd
from sklearn.preprocessing import PowerTransformer


def fit_yeo_johnson_transformer(train_imputed_numeric_df: pd.DataFrame):
    yeo_johnson_transformer = PowerTransformer(method="yeo-johnson", copy=True)
    yeo_johnson_transformer.fit(train_imputed_numeric_df)
    return yeo_johnson_transformer
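A short usage sketch for fit_yeo_johnson_transformer, with a made-up numeric DataFrame:

train_df = pd.DataFrame({'views': [3, 10, 250, 4000], 'clicks': [0, 1, 40, 90]})
yeo_johnson = fit_yeo_johnson_transformer(train_df)
transformed = pd.DataFrame(yeo_johnson.transform(train_df), columns=train_df.columns)
print(transformed.describe())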
Code Example #10
    list_symmetry_score.append(symmetry_score)
    list_colour_scores.append(colour_score)

df = pd.DataFrame({
    "Asymmetry score": list_symmetry_score,
    "Border score": list_border_score,
    "Colour score": list_colour_scores
})

# Predict labels for each fold using the KNN algorithm
X = X_full = df.to_numpy()  # From 2 onwards so that the melanoma info is not included
# Load labels
control_group = np.array(control_group)
y = control_group
distributions = [('Data after power transformation (Box-Cox)',
                  PowerTransformer(method='box-cox').fit_transform(X))]

for i in distributions:

    titel = i[0]
    methode = i[0]

    X = i[1]
    X[:, 0] = X[:, 0] * 1.6
    X[:, 1] = X[:, 1] * 2.0
    X[:, 2] = X[:, 2] * 2.0
    symmetrie = X[:, 0]
    kleur = X[:, 2]
    border = X[:, 1]
    kf = StratifiedShuffleSplit(n_splits=1, test_size=0.4)
    list_border_score.append(border_score)
Code Example #11
def predefined_ops():
    '''Return a dict of user-defined, non-default instances of operators
    '''
    clean = {
        'clean':
        Cleaner(dtype_filter='not_datetime',
                na1='null',
                na2='mean',
                drop_uid=True),
        'cleanNA':
        Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Cleaner(dtype_filter='not_datetime', na1='most_frequent', na2='mean'),
        'cleanMn':
        Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    #
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),

        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }

    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'IsolationForest',
                            'contamination': 0.1
                        }),
        'inlierLocal':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'LocalOutlierFactor',
                            'contamination': 0.1
                        }),
        'inlierEllip':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'EllipticEnvelope',
                            'contamination': 0.1
                        }),
        'inlierOsvm':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'OneClassSVM',
                            'contamination': 0.1
                        }),
    }

    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga',
                                           C=1e-2)),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb':
        SelectFromModel(
            XGBClassifier(n_jobs=-1,
                          booster='gbtree',
                          max_depth=2,
                          n_estimators=50), ),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),

        # fixed number of features
        'fxgb20':
        SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'),
                        max_features=20),
        'frf20':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=20),
        'frf10':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=10),
        'fRFElog':
        RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2), step=0.1),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf':
        GenericUnivariateSelect(f_classif, 'percentile', 25),
    }

    imp = {
        "impXGB":
        XGBClassifier(n_jobs=-1,
                      booster='gbtree',
                      max_depth=2,
                      n_estimators=50),
        "impRF":
        ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }

    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
Code Example #12
# imports inferred from the usage in this snippet
import numpy as np
import pandas as pd
from scipy import stats
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer


def produce_smoted():

    sample_map = {
        1: 500,
        2: 500,
        3: 500,
        4: 500,
        5: 500,
        6: 500,
        7: 500,
        8: 500,
        9: 500,
        10: 500
    }

    # Read data
    X = pd.read_csv('train_data.csv', header=None)
    y = pd.read_csv('train_labels.csv', header=None)

    # Combine for shuffling and partitioning
    data = pd.concat([y, X], axis='columns', ignore_index=True)

    # Shuffle for more reliable validation later
    data = data.sample(frac=1).reset_index(drop=True)

    # Let's partition. 1st part is used to train with SMOTE, 2nd (smaller) part is used to validate
    train, test = train_test_split(data, test_size=0.3, random_state=0)

    # Find x & y
    x_train = train.drop(labels=0, axis='columns')
    y_train = train[[0]]

    x_test = test.drop(labels=0, axis='columns')
    y_test = test[[0]]

    # Let's try SMOTE
    X_resampled, y_resampled = BorderlineSMOTE().fit_resample(x_train, y_train)
    X_resampled = pd.DataFrame(X_resampled)
    y_resampled = pd.DataFrame(y_resampled)

    training_data = pd.concat(
        [y_resampled, X_resampled], axis='columns',
        ignore_index=True).sample(frac=1).reset_index(drop=True)

    # Rhythm patterns
    rhythm = training_data.iloc[:, 1:169].copy()

    # Chroma
    chroma_cleaned = training_data.iloc[:, 169:205]

    # MFCCs
    mfcc_cleaned = training_data.iloc[:, 221:265].copy()

    cleaned_x_training = pd.concat([rhythm, chroma_cleaned, mfcc_cleaned],
                                   axis='columns',
                                   ignore_index=True)

    # Outlier detection
    threshold = 3
    for col in range(cleaned_x_training.shape[1]):
        mean = np.mean(cleaned_x_training.iloc[:, col])
        z = np.abs(stats.zscore(cleaned_x_training.iloc[:, col]))
        rows = np.where(z > threshold)[0]
        for row in rows:
            cleaned_x_training.at[row, col] = mean

    # Scaling
    scaler = PowerTransformer()
    scaled_data = scaler.fit_transform(cleaned_x_training)
    scaled_x_training = pd.DataFrame(scaled_data)

    # NOW SAME OPERATIONS FOR VALIDATION DATA
    validation_data = pd.concat(
        [y_test, x_test], axis='columns',
        ignore_index=True).sample(frac=1).reset_index(drop=True)

    # Rhythm patterns
    rhythm = validation_data.iloc[:, 1:169].copy()

    # Chroma
    chroma_cleaned = validation_data.iloc[:, 169:193]

    # MFCCs
    mfcc_cleaned = validation_data.iloc[:, 221:265].copy()

    cleaned_x_validation = pd.concat([rhythm, chroma_cleaned, mfcc_cleaned],
                                     axis='columns',
                                     ignore_index=True)

    # Outlier detection
    threshold = 3
    for col_val in range(cleaned_x_validation.shape[1]):
        mean_val = np.mean(cleaned_x_validation.iloc[:, col_val])
        z_val = np.abs(stats.zscore(cleaned_x_validation.iloc[:, col_val]))
        rows_val = np.where(z_val > threshold)[0]
        for row_val in rows_val:
            cleaned_x_validation.at[row_val, col_val] = mean_val

    # Scaling
    scaler = PowerTransformer()
    scaled_data = scaler.fit_transform(cleaned_x_validation)
    scaled_x_validation = pd.DataFrame(scaled_data)

    return scaled_x_training, scaled_x_validation, training_data[[
        0
    ]], validation_data[[0]]
Code Example #13
    df_final = pd.concat([x, y], axis=1)
    df_final = df_final.rename(columns={f'internet_traffic_{cell_id}': 'y'})
    df_final['y_sh'] = df_final['y'].shift(periods=-1)
    df_final = df_final.dropna()

# SPLITTING AND PREPARING DATASET
size = len(df_final)

X_train = df_final.drop(['y_sh'], axis=1)[:int(0.8 * size)]
y_train = df_final['y_sh'][:int(0.8 * size)]

X_test = df_final.drop(['y_sh'], axis=1)[int(0.8 * size):]
y_test = df_final['y_sh'][int(0.8 * size):]

scaler = PowerTransformer()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

n_features = X_train.shape[1]

# PREPARING DATA TO INPUT INTO NNET
X_train = X_train.reshape(-1, 1, n_features)
X_test = X_test.reshape(-1, 1, n_features)

# %%
file_path = f'C:\\Users\\patri\\Documents\\Github\\milan-telecom-analysis\\results\\model_results\\{neighorrs}_neighbors_id_{cell_id}.h5'
network = keras.models.load_model(file_path,
Code Example #14
def __init__(self, name='YeoJohnson'):
    super().__init__(name)
    self.inplace = True
    self.power = PowerTransformer(method='yeo-johnson', standardize=False)
Code Example #15
File: teste.py Project: devscheffer/desafio-2-2020
# %%
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# scaler = [StandardScaler()]
classifier_test = [SVC()]

#=================Scaler
scaler = [
    StandardScaler(),
    MinMaxScaler(),
    MaxAbsScaler(),
    RobustScaler(quantile_range=(25, 75)),
    PowerTransformer(method='yeo-johnson'),
    # PowerTransformer(method='box-cox'),
    QuantileTransformer(output_distribution='normal'),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

# %%

#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
]

    def __init__(self,
                 preprocess_type=None,
                 extend_data=False,
                 short_end=False):

        self.config = Config()
        # prepare input data
        config_path = self.config.get_filepath("", "config.yaml")

        config_file = open(config_path, 'r')
        yaml_config = yaml.load(config_file, Loader=yaml.SafeLoader)

        self.training_dataset_names = [
            d['name'] for d in yaml_config['training_datasets']
        ]
        self.training_dataset_start_pos = [
            d['start_position'] for d in yaml_config['training_datasets']
        ]
        self.test_dataset_names = [
            d['name'] for d in yaml_config['test_datasets']
        ]
        self.test_dataset_start_pos = [
            d['start_position'] for d in yaml_config['test_datasets']
        ]
        self.dataset_names = np.concatenate(
            (self.training_dataset_names,
             self.test_dataset_names))  # do we need these?
        self.dataset_start_pos = np.concatenate(
            (self.training_dataset_start_pos,
             self.test_dataset_start_pos))  # do we need these?

        # read in all pickle files
        self.all_pd = []
        for dataset_name in self.dataset_names:
            self.all_pd.append(
                pd.read_pickle(self.config.get_filepath_data(dataset_name)))

        if extend_data:
            training_dataset_names_copy = np.array(self.training_dataset_names,
                                                   copy=True)

            # create a copy of the data shifted up by 10
            for i, dataset_name in enumerate(training_dataset_names_copy):
                self.dataset_names = np.append(self.dataset_names,
                                               dataset_name + "_" + str(10))
                self.training_dataset_names = np.append(
                    self.training_dataset_names, dataset_name + "_" + str(10))
                self.dataset_start_pos = np.append(
                    self.dataset_start_pos, self.training_dataset_start_pos[i])
                self.training_dataset_start_pos.append(
                    self.training_dataset_start_pos[i])
                self.all_pd.append(self.all_pd[i].copy() + 10)

        self.dict_datasets = dict(
            zip(self.dataset_names, np.arange(len(self.dataset_names))))

        self.enable_difference = False

        self._feature_range = [0, 1]
        self.normalisation_scalers = []
        for _ in self.dataset_names:
            self.normalisation_scalers.append(
                MinMaxScaler(feature_range=self.feature_range))

        self.enable_normalisation_scaler = False
        self.enable_ignore_price = False  # scale each curve to feature_range

        self.power_transformer = PowerTransformer()
        self.enable_power_transform = False

        self.standardisation_scalers = []
        for _ in self.dataset_names:
            self.standardisation_scalers.append(StandardScaler())

        self.enable_standardisation_scaler = False

        self.enable_log_returns = False
        self.mult_factor = 10  # 5
        self.add_factor = 25  # 6

        self.enable_log = False
        self.enable_pct_change = False

        self.enable_curve_smoothing = False

        self.short_end = short_end

        # now setup PreprocessType settings
        if preprocess_type is PreprocessType.NORMALISATION_OVER_TENORS:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
        elif preprocess_type is PreprocessType.NORMALISATION_OVER_CURVES:
            self.enable_normalisation_scaler = True
            self.feature_range = [0, 1]
            self.enable_ignore_price = True
        elif preprocess_type is PreprocessType.STANDARDISATION_OVER_TENORS:
            self.enable_standardisation_scaler = True
        elif preprocess_type is PreprocessType.LOG_RETURNS_OVER_TENORS:
            self.enable_log_returns = True
Code Example #17
import pandas as pd
from sklearn.preprocessing import PowerTransformer


def gaussian_scaler(train, test, method='yeo-johnson'):
    scaler = PowerTransformer(method, standardize=False, copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
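A minimal usage sketch for gaussian_scaler, with hypothetical train/test DataFrames:

train = pd.DataFrame({'income': [20, 35, 50, 120], 'age': [22, 31, 45, 60]})
test = pd.DataFrame({'income': [28, 400], 'age': [25, 52]})
scaler, train_scaled, test_scaled = gaussian_scaler(train, test, method='yeo-johnson')
print(train_scaled.head())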
Code Example #18
def chang_hug_map(X, hex_colors, FONT_SIZE=12, BINS=30):
    '''
    Function that applies the Chang & Hug approach of mapping data to a normal distribution:
    REF: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_map_data_to_normal.html#sphx-glr-auto-examples-preprocessing-plot-map-data-to-normal-py
    
    Parameters:
    * X = features
    * hex_colors = hexadecimal colors to be used for each feature
    * FONT_SIZE = size of font on plots
    * BINS = number of bins on histogram plots
    '''
    # setting preprocessing methods: PowerTransformer (Box-Cox, Yeo-Johnson); QuantileTransformer
    scaler = MinMaxScaler(feature_range=(1, 2))
    boxcox = PowerTransformer(method='box-cox')
    bc = Pipeline(steps=[('s', scaler), ('bc', boxcox)])

    yj = PowerTransformer(method='yeo-johnson')

    rng = np.random.RandomState(304)
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=rng)

    # adding distributions of columns
    distributions = []
    for i in range(0, len(X.columns)):
        name = X.columns[i]
        array = X[X.columns[i]].to_numpy().reshape(-1, 1)
        distributions.append((name, array))

    colors = hex_colors

    # generating the plot
    fig, axes = plt.subplots(
        nrows=12, ncols=15,
        figsize=(35, 25))  # cols = num of preprocessing methods + original
    axes = axes.flatten()
    axes_idxs = [
        (0, 15, 30, 45),
        (1, 16, 31, 46),
        (2, 17, 32, 47),
        (3, 18, 33, 48),
        (4, 19, 34, 49),
        (5, 20, 35, 50),  # first set
        (6, 21, 36, 51),
        (7, 22, 37, 52),
        (8, 23, 38, 53),
        (9, 24, 39, 54),
        (10, 25, 40, 55),
        (11, 26, 41, 56),
        (12, 27, 42, 57),
        (13, 28, 43, 58),
        (14, 29, 44, 59),
        (60, 75, 90, 105),
        (61, 76, 91, 106),
        (62, 77, 92, 107),
        (63, 78, 93, 108),
        (64, 79, 94, 109),
        (65, 80, 95, 110),  # second set
        (66, 81, 96, 111),
        (67, 82, 97, 112),
        (68, 83, 98, 113),
        (69, 84, 99, 114),
        (70, 85, 100, 115),
        (71, 86, 101, 116),
        (72, 87, 102, 117),
        (73, 88, 103, 118),
        (74, 89, 104, 119),
        (120, 135, 150, 165),
        (121, 136, 151, 166),
        (122, 137, 152, 167),
        (123, 138, 153, 168),
        (124, 139, 154, 169),
        (125, 140, 155, 170),
        (126, 141, 156, 171),
        (127, 142, 157, 172),
        (128, 143, 158, 173),
        (129, 144, 159, 174),
        (130, 145, 160, 175),
        (131, 146, 161, 176),
        (132, 147, 162, 177),
        (133, 148, 163, 178),
        (134, 149, 164, 179)
    ]

    axes_list = [(axes[i], axes[j], axes[k], axes[l])
                 for (i, j, k, l) in axes_idxs]

    for distribution, color, axes in zip(distributions, colors, axes_list):
        name, X_col = distribution
        X_train, X_test = train_test_split(X_col,
                                           test_size=0.2,
                                           random_state=rng)

        # perform power and quantile transforms
        X_trans_bc = bc.fit(X_train).transform(X_test)
        lmbda_bc = round(bc.named_steps['bc'].lambdas_[0], 2)
        X_trans_yj = yj.fit(X_train).transform(X_test)
        lmbda_yj = round(yj.lambdas_[0], 2)
        X_trans_qt = qt.fit(X_train).transform(X_test)

        ax_original, ax_bc, ax_yj, ax_qt = axes

        ax_original.hist(X_train, color=color, bins=BINS)
        ax_original.set_title(name, fontsize=FONT_SIZE)
        ax_original.tick_params(axis='both',
                                which='major',
                                labelsize=FONT_SIZE)

        for ax, X_trans, meth_name, lmbda in zip(
            (ax_bc, ax_yj, ax_qt), (X_trans_bc, X_trans_yj, X_trans_qt),
            ('Box-Cox', 'Yeo-Johnson', 'Quantile transform'),
            (lmbda_bc, lmbda_yj, None)):
            ax.hist(X_trans, color=color, bins=BINS)
            title = f'After {meth_name}'
            if lmbda is not None:
                title += f'\n$\lambda$ = {lmbda}'
            ax.set_title(title, fontsize=FONT_SIZE)
            ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
            ax.set_xlim([-3.5, 3.5])

    # Setting last plot as empty
    for i in range(-10, 0):
        ax_original, ax_bc, ax_yj, ax_qt = axes_list[i]
        ax_original.axis('off')
        ax_bc.axis('off')
        ax_yj.axis('off')
        ax_qt.axis('off')

    # Export and last adjustments
    plt.tight_layout()
    plt.savefig('fig/09_col_trf.png')
    plt.show()
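A hypothetical call to chang_hug_map, assuming X is a DataFrame of strictly numeric features and supplying one hex colour per column (the colour list here is made up):

hex_colors = ['#1f77b4'] * len(X.columns)
chang_hug_map(X, hex_colors, FONT_SIZE=10, BINS=25)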
Code Example #19
train_x = dataset_train[:, data_train.columns.isin(variables_x)]
train_y = dataset_train[:, data_train.columns.isin(variables_y)].reshape(-1)
test_x = dataset_test[:, data_train.columns.isin(variables_x)]
test_y = dataset_test[:, data_train.columns.isin(variables_y)].reshape(-1)

#
# train_x = StandardScaler().fit_transform(train_x)
# test_x = StandardScaler().fit_transform(test_x)

# train_x = MinMaxScaler().fit_transform(train_x)
# test_x = MinMaxScaler().fit_transform(test_x)
# train_x = QuantileTransformer().fit_transform(train_x)
# test_x = QuantileTransformer().fit_transform(test_x)
#
train_x = PowerTransformer().fit_transform(train_x)
test_x = PowerTransformer().fit_transform(test_x)

#
with open('../data/power_m_o_test_x.csv', 'w', newline='') as myFile:
    writer = csv.writer(myFile)
    writer.writerows(test_x)

with open('../data/power_m_o_train_x.csv', 'w', newline='') as myFile2:
    writer2 = csv.writer(myFile2)
    writer2.writerows(train_x)


# train_x = PowerTransformer().fit_transform(train_x)
Code Example #20
iris = load_iris()


def _get_valid_samples_by_column(X, col):
    """Get non NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False,
         []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive,
                                omit_kwargs):
    # check that the preprocessing method lets NaN values pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
Code Example #21
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBRFClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

t = np.array(list(df['creatinine_phosphokinase'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
creatinine_phosphokinase = pt.fit_transform(t)
df['creatinine_phosphokinase'] = creatinine_phosphokinase

t = np.array(list(df['serum_creatinine'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
serum_creatinine = pt.fit_transform(t)
df['serum_creatinine'] = serum_creatinine

df.drop(columns=['sex', 'diabetes'], inplace=True)
X = df.iloc[:, 0:10].values
Y = df['DEATH_EVENT'].values

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=6)

xrclf = XGBRFClassifier()
xrclf.fit(x_train, y_train)
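A possible follow-up, not part of the original snippet: report the accuracy of the fitted XGBRFClassifier on the held-out split.

print('test accuracy:', xrclf.score(x_test, y_test))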
Code Example #22
def __init__(self):
    self.pt = PowerTransformer()

import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9566871315015497
exported_pipeline = make_pipeline(
    PowerTransformer(), StandardScaler(), RobustScaler(),
    GaussianProcessRegressor(kernel=Matern(length_scale=4.0, nu=2.5),
                             n_restarts_optimizer=185,
                             normalize_y=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Code Example #24
'''
Log transformation
In the previous exercises you scaled the data linearly, which will not affect the data's shape. This works great if your data is normally distributed (or close to normally distributed), an assumption that a lot of machine learning models make. Sometimes you will work with data that closely conforms to normality, e.g. the height or weight of a population. On the other hand, many variables in the real world do not follow this pattern, e.g., wages or age of a population. In this exercise you will use a log transform on the ConvertedSalary column in the so_numeric_df DataFrame, as it has a large amount of its data centered around the lower values but also contains very high values. These distributions are said to have a long right tail.

Instructions
Import PowerTransformer from sklearn's preprocessing module.
Instantiate the PowerTransformer() as pow_trans.
Fit the PowerTransformer on the ConvertedSalary column of so_numeric_df.
Transform the same column with the scaler you just fit.
'''
# SOLUTION

# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Instantiate PowerTransformer
pow_trans = PowerTransformer()

# Train the transform on the data
pow_trans.fit(so_numeric_df[['ConvertedSalary']])

# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(
    so_numeric_df[['ConvertedSalary']])

# Plot the data before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()
Code Example #25
File: model.py Project: bilha-analytics/lab_works_2
    yhat = model3.transform(tmpX[n_train:], tmpY[n_train:])
    model3.zscore(yhat, tmpY[n_train:])

    print(f"{c*10} End Model3 <-- new, hyperparams {c*10}\n")
    # ## TODO: classifier/regressor/clusterer/etc Mixin requirements
    # piper = Pipeline(['model', model2])
    # print( piper )

    # piper.fit_transform(tmpX, tmpY)


    print(f"\n{c*10} Starting TrainingManager with Grid Search {c*10}\n")
    import preprocess, extract 
    from sklearn.preprocessing import StandardScaler, PowerTransformer
    from sklearn.linear_model import LogisticRegression
    from sklearn import svm 

    dpipez = [Pipeline([('scaler', StandardScaler()), ]),  
                Pipeline([('power', PowerTransformer()),])
                ]
    mpipez = [ ( Pipeline([ ('flatten', preprocess.Flattenor()), ('svm', svm.SVC() ) ]), {'kernel':('linear', 'rbf'), 'C':[1, 10]}) ,  ## 
                ( Pipeline([ ('flatten', preprocess.Flattenor()),('logit', LogisticRegression() ) ]), {'C':[1,10]} ), ##
                (Pipeline([('reshaper', preprocess.Reshapeor( (1, -1)) ), ('tensorfy', preprocess.ToTensor() ),('zmodel', model2)]), {}) 
             ] #*tmpX[0].shape

    print( mpipez)

    mgr = ZTrainingManager() 
    mgr.build_permutationz(data_pipez=dpipez, model_pipez=mpipez)
    mgr.run( [x.cpu().numpy().ravel() for x in tmpX], [y.cpu().numpy().ravel() for y in tmpY] , train_test_split=1.)
    print(f"{c*10} End ZTrainingManager {c*10}\n")
Code Example #26
from sklearn.preprocessing import PowerTransformer


def gaussian_scaler(train, test):
    scaler = PowerTransformer(method='yeo-johnson', standardize=False, copy=True).fit(train)

    train_scaled, test_scaled = transform_scaler(train, test, scaler)

    return train_scaled, test_scaled, scaler
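This snippet depends on a transform_scaler helper that is not shown. A plausible reconstruction, under the assumption that it simply applies the already-fitted scaler to both frames and preserves the DataFrame structure (hypothetical, not the project's actual code):

import pandas as pd

def transform_scaler(train, test, scaler):
    # hypothetical helper: apply a fitted scaler and keep the original columns/index
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return train_scaled, test_scaled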
Code Example #27
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer, minmax_scale

print(__doc__)


N_SAMPLES = 3000
FONT_SIZE = 6
BINS = 100


pt = PowerTransformer(method='box-cox')
rng = np.random.RandomState(304)
size = (N_SAMPLES, 1)


# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)
Code Example #28

def transform(cols, cols_to_transform, scaler):
    values = scaler.transform(cols)
    return df[['name', 'alignment'
               ]].join(pd.DataFrame(values, columns=cols_to_transform))


scalers = [
    StandardScaler(),    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
    MinMaxScaler(),      # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
    RobustScaler(),      # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
    PowerTransformer(),  # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
    Normalizer(),        # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html
]

_df = df[['name', 'alignment', 'weight', 'height']].dropna()
cols_to_transform = ['weight', 'height']
df_to_scale = _df[cols_to_transform]

for scaler in scalers:
    scaled_values = scaler.fit_transform(df_to_scale)
    scaled_values = pd.DataFrame(scaled_values, columns=cols_to_transform)
    df_transformed = _df[['name', 'alignment']].join(scaled_values)
    plot_weight_vs_height(df_transformed, str(scaler.__class__.__name__))

# -
Code Example #29
from sklearn.preprocessing import PowerTransformer


def to_normal(train, test, features, method="yeo-johnson"):
    # method can be box-cox
    pt = PowerTransformer(method=method)
    train[features] = pt.fit_transform(train[features])
    test[features] = pt.transform(test[features])
    return train, test
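A minimal usage sketch for to_normal, with made-up train/test DataFrames:

import pandas as pd

train = pd.DataFrame({'salary': [30000, 45000, 52000, 250000], 'age': [22, 35, 41, 58]})
test = pd.DataFrame({'salary': [38000, 90000], 'age': [29, 63]})
train, test = to_normal(train, test, features=['salary', 'age'], method='yeo-johnson')
print(train.head())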
Code Example #30
    def post(self, init_dataset=init_dataset):
        """
        POST HTTP request
        @param init_dataset: Init dataset loaded
        @return: 200 code with history of the fit
        """
        start_time = time.time()

        # Init the main parameters of training
        test_size = request.args.get('test_size', default=0.2, type=float)
        batch_size = request.args.get('batch_size', default=512, type=int)
        epochs = request.args.get('epochs', default=20, type=int)
        frac = request.args.get('frac', default=1, type=float)

        # Test of frac feature
        if not 1 >= frac >= 0.0001:
            return ioObj.generic_err400_with_resp(
                'Wrong format of frac feature. Please, provide a float number in (0,1]', start_time)
        LOG.info("Loaded formatted data",
                 extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

        # Test of test_size feature
        if not 1 >= test_size >= 0.0001:
            return ioObj.generic_err400_with_resp(
                'Wrong format of test_size feature. Please, provide a float number in (0,1]', start_time)
        LOG.info("Loaded formatted data",
                 extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

        # Fetch frac of the dataset
        frac_dataset = init_dataset.sample(frac=frac)

        # Train test split
        X_train, X_test, y_train, y_test = train_test_split(frac_dataset.iloc[:, 3:-1],
                                                            frac_dataset.iloc[:, -1],
                                                            test_size=test_size)

        LOG.info("Train test split",
                 extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

        # Init of two PowerTransformer objects
        model_new.def_scaler(PowerTransformer(), PowerTransformer())

        # Fit on X_train and transform both test and train samples
        X_train = model_new.scalerX.fit_transform(X_train)
        X_test = model_new.scalerX.transform(X_test)

        # Fit on y_train and transform both test and train samples
        y_train = model_new.scalerY.fit_transform(y_train.to_numpy().reshape(-1, 1))
        y_test = model_new.scalerY.transform(y_test.to_numpy().reshape(-1, 1))

        LOG.info("Transformed input and output data",
                 extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

        # Set the model regressor and compile
        regressor = model_new.model
        regressor.compile(optimizer='adam', loss='mse', metrics=['mae', model_new.coeff_determination])

        LOG.info("Compiled the new model and about to fit it",
                 extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

        # Fit regressor
        history = regressor.fit(
            X_train,
            y_train,
            batch_size=batch_size,
            epochs=epochs,
            callbacks=[early_stopping],
            validation_data=(X_test, y_test)
        )

        LOG.info("Fitted model. Output the results",
                 extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})

        # Format the results
        final_results = {}
        for key in history.history.keys():
            final_results[key] = history.history[key]

        # Serialize format of model, weights, and PowerTransformers
        with open(mdl_new_name, "w") as json_file:
            json_file.write(regressor.to_json())
        regressor.save_weights(mdl_new_weights_filename)
        pickle.dump(model_new.scalerX, open(scaler_new_X_filename, 'wb'))
        pickle.dump(model_new.scalerY, open(scaler_new_Y_filename, 'wb'))

        # Load files into MongoDB
        new_model_db.load_local_files()

        LOG.info("Loaded files of model, weights and PowerTransformers. Outputting the results",
                 extra={"status": 200, "time_elapsed": "%.3f seconds" % (time.time() - start_time)})
        return ioObj.generic_resp(200, 'application/json',
                                  ioObj.json_d(ioObj.success_message(final_results)))