コード例 #1
0
ファイル: utils.py プロジェクト: Billy-Bat/K_HousePrice
def pd_fixskew(Data, tresh=0.5, mthd='box-cox', exclude=None, return_lambda=False):
    """
    Power-transform the columns of a DataFrame to reduce their skewness.

    Every column that is not listed in ``exclude`` gets its own
    ``PowerTransformer(method=mthd, standardize=True)``. When fitting with
    ``mthd`` fails (Box-Cox, for example, rejects non-positive values) a
    warning is printed and the column is transformed with Yeo-Johnson
    instead. No epsilon shift is applied to the data.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame to transform; all columns end up in a float array.
    tresh : float
        Not used by the implementation; kept so existing calls still work.
    mthd : str
        ``method`` argument forwarded to ``PowerTransformer``.
    exclude : iterable of column labels, optional
        Columns copied to the output unchanged. Defaults to none.
    return_lambda : bool
        When true, also return the fitted transformers.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, list)
        The transformed frame (same index and columns as ``Data``). With
        ``return_lambda`` the second item holds one fitted transformer per
        *transformed* column, in column order; excluded columns contribute
        no entry, so list positions do not match column positions when
        ``exclude`` is used.
    """
    excluded = () if exclude is None else exclude
    f_cols = np.empty(shape=Data.shape)
    transformer = []
    for i, col in enumerate(Data.columns):
        if col in excluded:
            f_cols[:, i] = Data[col]
            continue

        array_col = np.reshape(Data[col].values, (-1, 1))
        try:
            trnsfm = PowerTransformer(method=mthd, standardize=True)
            f_col = trnsfm.fit_transform(array_col)
        except Exception:
            # Best-effort fallback, but let KeyboardInterrupt/SystemExit through.
            print('WARNING : {} failed on {} passing to yeo-johnson'.format(mthd, col))
            trnsfm = PowerTransformer(method='yeo-johnson', standardize=True)
            f_col = trnsfm.fit_transform(array_col)
        f_cols[:, i] = np.ravel(f_col)
        transformer.append(trnsfm)

    Data_skewFixed = pd.DataFrame(f_cols, index=Data.index, columns=Data.columns)
    if return_lambda:
        return Data_skewFixed, transformer
    return Data_skewFixed
def class_model(df):
    """
    Train a logistic-regression classifier on ``bad_loan`` and print metrics.

    The frame is split 70/30 into train and test parts, the features are
    power-transformed, and accuracy, precision, recall, F1 (all with
    ``pos_label=0``), the confusion matrix and the classification report are
    printed for the hold-out part. Afterwards the same estimator is scored
    with 10-fold cross validation on the whole (power-transformed) data set
    and the mean accuracy, macro precision, macro recall and macro F1 are
    printed.

    :param df: DataFrame containing the feature columns and a ``bad_loan``
        target column.
    :return: None; all results are printed.
    """
    # Split features / target and hold out 30% of the rows for testing
    x = df.drop('bad_loan', axis=1)
    y = df.bad_loan
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # Feature scaling: the transformer is fitted on the training rows only and
    # the fitted parameters are reused for the test rows, so both splits go
    # through the same mapping and nothing is learned from the test data.
    PT = PowerTransformer()
    x_train = PT.fit_transform(x_train)
    x_test = PT.transform(x_test)

    # Logistic regression evaluated on the hold-out split
    log = LogisticRegression()
    log.fit(x_train, y_train)
    log_pred = log.predict(x_test)

    log_accuracy = metrics.accuracy_score(y_test, log_pred)
    print("Accuracy: ", log_accuracy)

    log_precision = metrics.precision_score(y_test, log_pred, pos_label=0)
    print("Precision: ", log_precision)

    log_recall = metrics.recall_score(y_test, log_pred, pos_label=0)
    print("Recall: ", log_recall)

    log_f1_score = metrics.f1_score(y_test, log_pred, pos_label=0)
    print("F1 Score: ", log_f1_score)

    print("Confusion Matrix:\n", confusion_matrix(y_test, log_pred))
    print("Classification Report:\n", classification_report(y_test, log_pred))

    # Logistic regression evaluated with 10-fold cross validation.
    # NOTE(review): the transformer below is fitted on all rows before the
    # folds are made, so each validation fold influences the scaling.
    PT1 = PowerTransformer()
    x = PT1.fit_transform(x)

    cv_metrics = (
        ("Accuracy: ", 'accuracy'),
        ("Precision: ", 'precision_macro'),
        ("Recall: ", 'recall_macro'),
        ("F1 Score: ", 'f1_macro'),
    )
    print('Classification Results with cross validation::')
    for label, scoring in cv_metrics:
        fold_scores = cross_val_score(log, x, y, cv=10, scoring=scoring)
        print(label, fold_scores.mean())
コード例 #3
0
def normalize_by_category(data):
    """
    Standardise ``success_score`` separately inside every identifier group.

    ``split_data_by_identifier`` supplies a mapping of group key to frame.
    For each group, rows whose ``success_score`` is 3 or more sample standard
    deviations away from the group mean are dropped, the remaining scores are
    z-scored (population standard deviation) and then passed through
    ``map_transform(..., 0, 10)``. The processed groups are concatenated.

    :param data: input passed unchanged to ``split_data_by_identifier``.
    :return: concatenation of all processed group frames.
    """
    ID_dict = split_data_by_identifier(data)
    for key, item in ID_dict.items():
        score = item.success_score
        z_score = (score - score.mean()) / score.std()
        # Copy the filtered rows so the column assignments below write to an
        # independent frame rather than to a slice of the group frame.
        item = item[z_score.abs() < 3].copy()

        # NOTE(review): the transformed values are discarded, so this call has
        # no effect on the output beyond validating the two columns. Confirm
        # whether the power-transformed values were meant to be used.
        pt = PowerTransformer()
        pt.fit_transform(item[['success_score', 'income_bracket']])

        item['success_score'] = (
            item.success_score -
            item.success_score.mean()) / item.success_score.std(ddof=0)
        item['success_score'] = map_transform(item['success_score'], 0, 10)
        ID_dict[key] = item
    chart = pd.concat(ID_dict.values())
    return chart
コード例 #4
0
class Target_Transformation(BaseEstimator, TransformerMixin):
    """
    Power-transform the target column of a data set during ``fit_transform``.

    ``transform`` is a pass-through, so only the data given to
    ``fit_transform`` is modified. The fitted ``PowerTransformer`` is kept in
    ``p_transform_target``.
    """

    def __init__(self, target, function_to_apply='bc'):
        """
        :param target: label of the column to transform.
        :param function_to_apply: ``'bc'`` or ``'box-cox'`` selects Box-Cox;
            any other value selects Yeo-Johnson.
        """
        self.target = target
        # The stored value is the canonical method name. Accepting that name
        # as input too means an estimator rebuilt from its own parameters
        # (as sklearn.base.clone does) keeps Box-Cox instead of silently
        # falling through to Yeo-Johnson.
        if function_to_apply in ('bc', 'box-cox'):
            self.function_to_apply = 'box-cox'
        else:
            self.function_to_apply = 'yeo-johnson'

    def fit(self, dataset, y=None):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, dataset, y=None):
        """Return ``dataset`` unchanged."""
        return dataset

    def fit_transform(self, dataset, y=None):
        """
        Return a copy of ``dataset`` whose target column is power-transformed.

        :param dataset: DataFrame containing ``self.target``.
        :param y: ignored.
        :return: copy of ``dataset`` with the target column replaced.
        """
        data = dataset.copy()

        # if target column has zero or negative values then auto use yj method
        # (the switch is stored on the instance, so it persists for later calls)
        if (data[self.target] <= 0).any():
            self.function_to_apply = 'yeo-johnson'
        self.p_transform_target = PowerTransformer(
            method=self.function_to_apply)

        data[self.target] = self.p_transform_target.fit_transform(
            np.array(data[self.target]).reshape(-1, 1))

        return data
コード例 #5
0
def to_gaussian(data, submission_feat, exclude, gauss):
    """
    Optionally apply a Yeo-Johnson power transform to the numeric columns.

    :param data: input DataFrame; it is copied, never modified.
    :param submission_feat: iterable of strings; their upper-cased forms are
        left untransformed.
    :param exclude: iterable of column labels that are left untransformed.
    :param gauss: the transform is skipped when this equals ``"no"``.
    :return: tuple ``(df_gaussian, pt, features)``. When the transform runs,
        ``df_gaussian`` holds the transformed columns first followed by the
        untouched ones, ``pt`` is the fitted transformer and ``features`` the
        list of transformed columns in their original frame order. When it is
        skipped, ``df_gaussian`` is a copy of ``data`` and both ``pt`` and
        ``features`` are the string ``"No PowerTransformer applied"``.
    """
    if gauss != "no":
        from sklearn.preprocessing import PowerTransformer

        df = data.copy()
        skipped = set(exclude) | {feat.upper() for feat in submission_feat}
        numeric_cols = df.select_dtypes(include=[
            "uint8", "int16", "int32", "int64", "float16", "float32",
            "float64"
        ]).columns
        # Keep the frame's own column order (set arithmetic would make the
        # order, and with it the order of the fitted lambdas, vary from run
        # to run). dict.fromkeys removes duplicate labels like a set would.
        features = [
            col for col in dict.fromkeys(numeric_cols) if col not in skipped
        ]
        transformed = set(features)
        no_action = [
            col for col in dict.fromkeys(df.columns) if col not in transformed
        ]
        pt = PowerTransformer(method='yeo-johnson')
        values = pt.fit_transform(df[features])
        df_gaussian = pd.DataFrame(data=values,
                                   columns=df[features].columns,
                                   index=df[features].index)
        df_gaussian[no_action] = df[no_action]
    else:
        df_gaussian = data.copy()
        pt = "No PowerTransformer applied"
        features = pt
    gc.collect()
    return df_gaussian, pt, features
コード例 #6
0
def preprocessing(n_clicks, preprocessings):
    """
    Apply the selected preprocessing steps to ``use_df``.

    Presumably a Dash callback (it returns ``html.H5`` elements) - confirm
    against the registration site. The processed copy is stored in the
    module-level ``processed_df``. A step runs when its code occurs in
    ``str(preprocessings)``:

    * ``FN`` - fill missing values with the column means
    * ``YJ`` - Yeo-Johnson transform of ``num_cols``
    * ``SS`` - standard scaling of ``num_cols``
    * ``OE`` - one-hot encoding of ``cat_cols``

    :param n_clicks: click counter; ``0`` returns an empty heading.
    :param preprocessings: selected step codes, ``None`` or empty for none.
    :return: ``html.H5`` element with a status message.
    """
    global use_df
    global processed_df
    global cat_cols
    global num_cols
    if n_clicks == 0:
        return html.H5('')
    elif target_column is None:
        return html.H5('先に目的変数を指定してください。')
    elif preprocessings is None or preprocessings == []:
        processed_df = use_df.copy()
        return html.H5('前処理は行われていません')
    else:
        text = []
        selected = str(preprocessings)
        processed_df = use_df.copy()
        if 'FN' in selected:
            # numeric_only=True: means exist only for numeric columns, and
            # recent pandas raises instead of silently skipping the others.
            processed_df = processed_df.fillna(
                processed_df.mean(numeric_only=True))
            text.append('欠損値補完')
        if 'YJ' in selected:
            yj = PowerTransformer(method='yeo-johnson')
            processed_df[num_cols] = yj.fit_transform(processed_df[num_cols])
            text.append('Yeo-Johnson変換')
        if 'SS' in selected:
            ss = StandardScaler()
            processed_df[num_cols] = ss.fit_transform(processed_df[num_cols])
            text.append('標準化')
        if 'OE' in selected:
            oe = ce.OneHotEncoder(cols=cat_cols, handle_unknown='impute')
            processed_df = oe.fit_transform(processed_df)
            text.append('One-Hot Encoding')
        return html.H5('{}を行いました。'.format(text))
コード例 #7
0
def plot_obs_umaps(h5ad, obs_cols, clip=True, normalize=True):
    """
    Draw one UMAP figure per observation column, coloured by that column.

    TODO: revise the normalization! -> use vmin and vmax from scpy

    The selected ``h5ad.obs`` columns are modified in place.

    :param h5ad: annotated data object exposing an ``obs`` DataFrame.
    :param obs_cols: list of ``obs`` column names to plot.
    :param clip: limit each column to mean +/- 3 population standard
        deviations before plotting.
    :param normalize: apply a standardised Yeo-Johnson transform to the
        columns before plotting.
    :return: list of figures, one per entry of ``obs_cols``.
    """

    if clip:
        # Values beyond 3 standard deviations are replaced by the bound
        # itself. The earlier mask + fillna version dropped the fillna
        # results, so such values stayed NaN instead of being clipped.
        values = h5ad.obs[obs_cols]
        center = values.mean()
        spread = values.std(ddof=0)
        h5ad.obs[obs_cols] = values.clip(lower=center - 3 * spread,
                                         upper=center + 3 * spread,
                                         axis=1)

    if normalize:
        pete = PowerTransformer(method='yeo-johnson', standardize=True)
        h5ad.obs[obs_cols] = pete.fit_transform(h5ad.obs[obs_cols].values)

    figures = []
    for c in obs_cols:
        sc.pl.umap(h5ad, color=c, return_fig=True, cmap='RdYlBu_r')
        # The figure is fetched from pyplot's current-figure state.
        fig = plt.gcf()
        fig.set_size_inches(5, 4)
        figures.append(fig)

    return figures
コード例 #8
0
    def dataloader(self):
        """
        Split the train, validation and test frames into features and target.

        The target is the ``actual_load`` column. When ``self.transform`` is
        not ``None`` the train and validation targets are Box-Cox transformed
        (fitted on train only) and flattened to 1-D arrays, and the fitted
        transformer is dumped to ``SCALER_FILENAME``. The test target is
        always returned untransformed.

        :return: ``X_train, y_train, X_val, y_val, X_test, y_test``.
        """
        target = "actual_load"

        def _features(frame):
            # Everything except the target column.
            return frame.drop(columns=[target])

        X_train = _features(self.train)
        X_val = _features(self.val)
        X_test = _features(self.test)
        y_train = self.train[target]
        y_val = self.val[target]
        y_test = self.test[target]

        if self.transform is not None:
            scaler = PowerTransformer(method="box-cox")

            train_column = np.array(self.train[target]).reshape(-1, 1)
            y_train = scaler.fit_transform(train_column).ravel()

            val_column = np.array(self.val[target]).reshape(-1, 1)
            y_val = scaler.transform(val_column).ravel()

            # Persist the fitted transformer so test.py can invert predictions
            joblib.dump(scaler, SCALER_FILENAME)

        return X_train, y_train, X_val, y_val, X_test, y_test
コード例 #9
0
def data_preprocessing(dataset, model):
    '''
    Prepare features and labels and derive an outlier-filtered version.

    1) Labels are the second column; features are all columns except the
       first (an instance id, per the original comments) and the second.
    2) Features are scaled with PowerTransformer when ``model == NB``,
       otherwise with StandardScaler.
    3) Rows whose local outlier factor score is at or below the 0.5%
       quantile are removed from the "selected" arrays.

    :param dataset: DataFrame laid out as described above.
    :param model: model identifier compared against ``NB``.
    :return: ``X, y, X_selected, y_selected`` as arrays.
    '''
    label_column = dataset.columns[[1]]
    id_column = dataset.columns[[0]]

    # labels and features (label column first, then the id column, is removed)
    y = dataset.iloc[:, 1].to_numpy()
    features = dataset.drop(label_column, axis=1).drop(id_column, axis=1)

    # feature normalization
    if model == NB:
        scaler = PowerTransformer()
    else:
        scaler = StandardScaler()
    X = scaler.fit_transform(features)

    # outlier detection: keep rows scoring strictly above the 0.5% quantile
    detector = LocalOutlierFactor(n_neighbors=20)
    detector.fit_predict(X)
    scores = detector.negative_outlier_factor_
    keep = scores > np.quantile(scores, 0.005)

    return X, y, X[keep], y[keep]
コード例 #10
0
def map_2_gaussian(X, mapping_method):
    '''Map every row of an N*M array towards a Gaussian distribution.

    Each row is power-transformed on its own: its values are treated as the
    samples of one feature, so the transform parameter is estimated from the
    whole row (fitting on a ``(1, M)`` array would give the estimator a
    single sample per feature).

    :param X: array indexed by row; ``X.shape[0]`` rows are processed.
    :param mapping_method: ``'box-cox'`` or ``'yeo-johnson'``.
    :return: float32 array holding one ``(1, M)`` block per row. The output
        is not standardised, because the transformer is created with
        ``standardize=False``.
    '''
    pt = PowerTransformer(method=mapping_method, standardize=False)
    data = [
        # column vector in (samples of one feature), row vector out so the
        # result keeps the (N, 1, M) layout callers already receive
        pt.fit_transform(X[i].reshape(-1, 1)).reshape(1, -1)
        for i in range(0, X.shape[0])
    ]
    return np.array(data).astype('float32')
コード例 #11
0
def transform(df):
    """Fit a default ``PowerTransformer`` on ``df`` and return the result.

    : param df : a pandas DataFrame with values to transform,
    : return : whatever ``PowerTransformer.fit_transform`` returns for ``df``
        (the value is passed through without being wrapped in a DataFrame).
    """
    return PowerTransformer().fit_transform(df)
コード例 #12
0
	def normalize_by_category(data, identifier_1, identifier_2, success_category, numeric_correlative):
		"""
		Build a standardised ``true_scores`` column inside every identifier group.

		Groups come from ``split_data_by_identifier``. In each group the column
		named by ``success_category`` is moved to ``'success_category'``, rows 3
		or more sample standard deviations from the group mean are dropped, and
		``true_scores`` is set to the z-score (population standard deviation) of
		the remaining values passed through ``map_transform(..., 1, 10)``. The
		processed groups are concatenated and returned.
		"""
		ID_dict = split_data_by_identifier(data, identifier_1, identifier_2)
		for key, item in zip(ID_dict.keys(), ID_dict.values()):
			# Writes into the frame object held in ID_dict before `item` is rebound.
			item['success_category'] = item[success_category]
			# NOTE(review): if success_category == 'success_category' this drop
			# removes the column that was just written - confirm with callers.
			item = item.drop(success_category, axis = 1)
			# Keep rows whose |z-score| (sample standard deviation) is below 3.
			item = item[((item.success_category - item.success_category.mean()) / item.success_category.std()).abs() < 3]
			item['true_scores'] = item['success_category'].copy(deep=False)
			# Set the raw values aside; the column is dropped and re-attached below.
			temp = item['success_category']
			item = item.drop('success_category', axis=1)
			pt = PowerTransformer()
			# NOTE(review): the return value is discarded, so the power transform
			# does not reach the output - confirm whether it was meant to.
			pt.fit_transform(item[['true_scores', numeric_correlative]])
			item['true_scores'] = (item.true_scores - item.true_scores.mean())/item.true_scores.std(ddof=0)
			item['success_category'] = temp.to_frame()
			item['true_scores'] = map_transform(item['true_scores'], 1, 10)
			ID_dict[key] = item
		chart = pd.concat(ID_dict.values())
		return chart
コード例 #13
0
def predXgbYJ():
    """
    Fit an XGBoost regressor on ``X_t`` / ``y_t`` and predict for ``test``.

    Predictions are mapped back through ``pt2.inverse_transform``.

    :return: ``(pred_Elec, r2)`` where ``r2`` is the score on the training data.
    """
    model = xgb.XGBRegressor()
    model.fit(X_t, y_t)
    train_r2 = model.score(X_t, y_t)

    # NOTE(review): a fresh transformer is fitted on `test` itself; confirm
    # whether the transformer fitted on the training features should be reused.
    test_scaled = PowerTransformer().fit_transform(test)
    scaled_prediction = model.predict(test_scaled).reshape(-1, 1)
    return pt2.inverse_transform(scaled_prediction), train_r2
コード例 #14
0
def predRFYJ():
    """
    Fit a random-forest regressor on ``X_t`` / ``y_t`` and predict for ``test``.

    The forest uses 1400 trees and ``random_state=42``. Predictions are mapped
    back through ``pt2.inverse_transform``.

    :return: ``(pred_Elec, r2)`` where ``r2`` is the score on the training data.
    """
    forest = RandomForestRegressor(n_estimators=1400, random_state=42)
    forest.fit(X_t, y_t)
    train_r2 = forest.score(X_t, y_t)

    # NOTE(review): a fresh transformer is fitted on `test` itself; confirm
    # whether the transformer fitted on the training features should be reused.
    test_scaled = PowerTransformer().fit_transform(test)
    scaled_prediction = forest.predict(test_scaled).reshape(-1, 1)
    return pt2.inverse_transform(scaled_prediction), train_r2
コード例 #15
0
def predLinearYJ():
    """
    Fit a linear regression on ``X_t`` / ``y_t`` and predict for ``test``.

    Predictions are mapped back through ``pt2.inverse_transform``.

    :return: ``(pred_Elec, r2)`` where ``r2`` is the score on the training data.
    """
    regressor = linear_model.LinearRegression()
    regressor.fit(X_t, y_t)
    train_r2 = regressor.score(X_t, y_t)

    # NOTE(review): a fresh transformer is fitted on `test` itself; confirm
    # whether the transformer fitted on the training features should be reused.
    test_scaled = PowerTransformer().fit_transform(test)
    scaled_prediction = regressor.predict(test_scaled).reshape(-1, 1)
    return pt2.inverse_transform(scaled_prediction), train_r2
    def data_transformation(self, data):
        """
        Standard-scale ``data`` and then power-transform the scaled values.

        :param data: DataFrame; its columns and index are kept on the output.
        :return: ``(scaler, transformer, transformed_data)`` - the fitted
            StandardScaler, the fitted PowerTransformer and the final frame.
        """
        frame_layout = dict(columns=data.columns, index=data.index)

        scaler = StandardScaler()
        standard_data = pd.DataFrame(scaler.fit_transform(data), **frame_layout)

        transformer = PowerTransformer()
        transformed_data = pd.DataFrame(
            transformer.fit_transform(standard_data), **frame_layout)

        return scaler, transformer, transformed_data
コード例 #17
0
def apply_yeojohnson(df):
    """
    Apply a standardised Yeo-Johnson transform to every column of ``df``.

    The column labels are printed. The returned frame keeps those labels but
    is built without an explicit index, so it gets a default one.

    :param df: data accepted by ``pd.DataFrame``.
    :return: DataFrame with the transformed values.
    """
    frame = pd.DataFrame(df)
    labels = frame.columns
    print(labels)
    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
    return pd.DataFrame(yeo_johnson.fit_transform(frame), columns=labels)
コード例 #18
0
    def PowerScale(self, df, target):
        """
        Power-transform every column of ``df`` except ``target``.

        :param df: DataFrame holding the features and the target column.
        :param target: label of the column to leave untransformed.
        :return: ``(frame, "PowerTransformer()")`` where ``frame`` has the
            transformed features followed by the original target column.
        """
        features = df.drop(target, axis=1)
        scaled = PowerTransformer().fit_transform(features)
        result = pd.DataFrame(scaled,
                              index=features.index,
                              columns=features.columns)
        result[target] = df[target]
        return result, "PowerTransformer()"
コード例 #19
0
ファイル: clustering.py プロジェクト: whuss/dashboard_react
def _get_normalized_input_data(device: str, start_date: date,
                               end_date: date) -> pd.DataFrame:
    """
    Load input data via ``_get_input_data`` and power-transform it.

    An empty frame is returned as is; otherwise the transformed values are
    returned with the original columns and index.
    """
    raw = _get_input_data(device, start_date, end_date)
    if not raw.empty:
        scaled = PowerTransformer().fit_transform(raw)
        return pd.DataFrame(scaled, columns=raw.columns, index=raw.index)
    return raw
コード例 #20
0
def power_transform(df):
    """
    Power-transform the ``int`` / ``float`` columns of ``df`` in place.

    ``TARGET`` and columns whose name starts with ``SK_ID`` are skipped.
    ``AMT_INCOME_TOTAL`` uses Box-Cox, every other column Yeo-Johnson. Each
    transformed column is stored as float32.

    :param df: DataFrame to modify.
    :return: the same ``df`` object.
    """
    candidates = df.select_dtypes(include=['int', 'float']).columns
    for name in candidates:
        is_protected = name == 'TARGET' or name.startswith('SK_ID')
        if not is_protected:
            method = 'box-cox' if name == 'AMT_INCOME_TOTAL' else 'yeo-johnson'
            encoder = PowerTransformer(method=method)
            df[name] = encoder.fit_transform(df[[name]])
            df[name] = df[name].astype('float32')
    return df
コード例 #21
0
def box_cox_transform(df, include_missing_value=False):
    """
    Box-Cox transform the strictly positive numerical columns of ``df`` in place.

    :param df: DataFrame to modify.
    :param include_missing_value: when false, a column qualifies only if
        every entry is ``> 0`` (a missing entry fails that comparison, so it
        disqualifies the column). When true, a column qualifies if no entry
        is ``<= 0``, which lets columns with missing entries through.
    :return: the same ``df`` object; unchanged when no column qualifies.
    """
    num_cols = utl.get_numerical_columns(df)
    if include_missing_value:
        # `.any()` rather than `.all()`: one non-positive entry is enough to
        # make Box-Cox inapplicable to the column.
        pos_cols = [c for c in num_cols if not (df[c] <= 0.0).any()]
    else:
        pos_cols = [c for c in num_cols if (df[c] > 0.0).all()]

    if not pos_cols:
        # Nothing to fit on.
        return df

    pt = PowerTransformer(method='box-cox')
    df[pos_cols] = pt.fit_transform(df[pos_cols])

    return df
コード例 #22
0
 def transform(self, X):
     """
     Rebuild ``X`` as a DataFrame and rescale its numerical columns.

     ``self.strategy == "scaler"`` applies min-max scaling and
     ``"transformer"`` a Yeo-Johnson transform to ``self.numerical_cols``;
     any other strategy leaves the values untouched.

     NOTE(review): the scaler / transformer is created and fitted on every
     call, i.e. on the data being transformed - confirm this is intended.

     :param X: data accepted by ``pd.DataFrame`` with ``self.column_names``.
     :return: the resulting DataFrame.
     """
     X = pd.DataFrame(X, columns=self.column_names)
     if self.strategy == "scaler":
         rescaler = MinMaxScaler()
     elif self.strategy == "transformer":
         rescaler = PowerTransformer(method='yeo-johnson')
     else:
         return X
     numeric = self.numerical_cols
     X[numeric] = rescaler.fit_transform(X[numeric])
     return X
コード例 #23
0
    def power_scaler(train, test):
        '''
        Yeo-Johnson transform ``train`` and ``test`` feature-wise.

        The transformer is fitted on ``train`` only and then applied to
        ``test``. Returns the pair ``(train, test)`` of transformed data.
        '''
        yeo_johnson = PowerTransformer(method='yeo-johnson')
        train_scaled = yeo_johnson.fit_transform(train)
        test_scaled = yeo_johnson.transform(test)
        return train_scaled, test_scaled
コード例 #24
0
def model_main_classifier(C=0.05):
    """
    Train two linear SVM classifiers for ts_code '399300.SZ' and print results.

    Data comes from ``getdata`` (``type='classifier'``,
    ``startDate='20090101'``). The features are Yeo-Johnson transformed with
    parameters fitted on the training split only. A ``LinearSVC`` and a
    linear-kernel ``SVC`` are fitted and their test-set scores printed. The
    confusion matrix, the counts of true values and the precision that
    follow are computed for the SVC on the *training* split.

    :param C: ``C`` argument of ``LinearSVC``.
    :return: None; everything is printed.
    """
    x_train, x_test, y_train, y_test = getdata('399300.SZ',
                                               type='classifier',
                                               startDate='20090101')

    yeo_johnson = PowerTransformer(method='yeo-johnson')
    x_train = yeo_johnson.fit_transform(x_train)
    x_test = yeo_johnson.transform(x_test)

    # Linear support vector machine (LinearSVC)
    linear_svc = LinearSVC(C=C)
    linear_svc.fit(x_train, y_train)
    print('linearSVC score:', linear_svc.score(x_test, y_test))

    # SVC with a linear kernel
    kernel_svc = SVC(kernel='linear', cache_size=1000)
    kernel_svc.fit(x_train, y_train)
    print('SVC score:', kernel_svc.score(x_test, y_test))

    # Diagnostics on the training split
    train_predictions = kernel_svc.predict(x_train)
    print(confusion_matrix(y_train, train_predictions))
    print(f'真实值中为True的次数: {y_train.sum()}')
    print(f'预测值中为True的次数: {train_predictions.sum()}')
    print(f'精度(precision):{precision_score(y_train, train_predictions)}')
コード例 #25
0
def transform_amplitude(inputfile, scale=True):
    """
    Load amplitudes from a binary file and transform them three ways.

    The file is read as 64-bit floats and arranged as one column. The column
    is transformed with Box-Cox, Yeo-Johnson and a quantile transform to a
    normal distribution (one quantile per sample).

    :param inputfile: file read with ``np.fromfile``.
    :param scale: when true, each transformed result is additionally
        min-max scaled.
    :return: ``(amplitudes, bc_amplitudes, yj_amplitudes, qt_amplitudes)``.
    """
    # np.float was only an alias of the builtin float (64-bit) and no longer
    # exists in current NumPy; np.float64 reads the same data.
    amplitudes = np.fromfile(inputfile, dtype=np.float64)
    n_samples = amplitudes.shape[0]
    amplitudes = amplitudes.reshape((n_samples, -1))

    bc = PowerTransformer(method='box-cox')
    yj = PowerTransformer(method='yeo-johnson')
    qt = QuantileTransformer(n_quantiles=n_samples,
                             output_distribution='normal')
    min_max_scaler = MinMaxScaler()

    bc_amplitudes = bc.fit_transform(amplitudes)
    yj_amplitudes = yj.fit_transform(amplitudes)
    qt_amplitudes = qt.fit_transform(amplitudes)

    if scale:
        # The scaler is refitted for each result, so they are scaled independently.
        bc_amplitudes = min_max_scaler.fit_transform(bc_amplitudes)
        yj_amplitudes = min_max_scaler.fit_transform(yj_amplitudes)
        qt_amplitudes = min_max_scaler.fit_transform(qt_amplitudes)

    return amplitudes, bc_amplitudes, yj_amplitudes, qt_amplitudes
コード例 #26
0
def transform4mancova(df, save_dir, file_name, ncolumns = None):
    '''
    Yeo-Johnson transform selected columns of ``df`` in place and save it.

    :param df: manovav_df - or any data frame that you want to transform to a normal distribution.
    :param save_dir: prefix of the output path; it is concatenated as is, so
        it must already end with a path separator.
    :param file_name: output name; the file is written to
        ``save_dir + 'normalized_' + file_name + '.csv'``.
    :param ncolumns: Which columns to normalize. Defaults to all columns
        whose name contains 'NARS', 'BFI' or 'GODSPEED'.
    :return: None
    '''
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    # `is None` keeps the check valid when an Index or array is passed
    # (`== None` would compare element-wise).
    if ncolumns is None:
        ncolumns = df.columns[df.columns.str.contains('NARS') | df.columns.str.contains('BFI') | df.columns.str.contains('GODSPEED')]
    # Reuse df's index: assigning a frame aligns on the index, so a default
    # RangeIndex would produce NaN for any df that is not 0..n-1 indexed.
    df[ncolumns] = pd.DataFrame(data = pt.fit_transform(df[ncolumns]),
                                columns = ncolumns,
                                index = df.index)
    df.to_csv(save_dir + 'normalized_' + file_name + '.csv')
コード例 #27
0
def normalize_features(dataset):
    """
    Power-transform the numerical features and plot each resulting column.

    The numerical features come from ``num_features``. The fitted lambdas are
    printed, then for every column a distribution plot is shown and its
    skewness and kurtosis are printed.

    :param dataset: input passed to ``num_features``.
    :return: DataFrame of transformed values (built without an explicit
        index, so it gets a default one).
    """
    numeric = num_features(dataset)
    pt = PowerTransformer()
    transformed_values = pt.fit_transform(numeric)
    numeric = pd.DataFrame(data=transformed_values, columns=list(numeric))
    print('plot after transformation with lambda :', pt.lambdas_)
    for column in list(numeric):
        plt.figure(figsize=(5, 5))
        sns.distplot(numeric[column])
        plt.show()
        print('Skewness :', numeric[column].skew())
        print('kurtosis :', numeric[column].kurtosis())
    return numeric
コード例 #28
0
def transform_X_Power(X_train, column):
    """
    Fit a standardised Yeo-Johnson transform on ``X_train`` and apply it.

    :param X_train: training data to fit on and transform.
    :param column: column labels for the returned frame.
    :return: DataFrame of the transformed training data.
    """
    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
    transformed = yeo_johnson.fit_transform(X_train)
    return pd.DataFrame(transformed, columns=column)
コード例 #29
0
def power_transform(logged=None):
    """
    Power-transform six variables and plot histograms of the first five.

    :param logged: indexable container whose items 0-5 are, in order: price,
        sqft_living, sqft_lot, sqft_living15, sqft_lot15 and yard_size.
    :return: ``None`` (after printing an error) when ``logged`` is ``None``;
        otherwise ``(powered_vars, None)`` where ``powered_vars`` lists the
        transformed columns in the order price, sqft_living, sqft_living15,
        sqft_lot, sqft_lot15, yard_size and the second item is the result of
        ``plt.show()``.
    """
    # Validate before importing the plotting / modelling libraries.
    # `is None` also keeps the check valid for array arguments.
    if logged is None:
        return print(
            "Error: must have list-like logged argument that contains 6 elements."
        )

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    power = PowerTransformer()

    # The same transformer object is refitted for every variable.
    (power_price, power_sqft_living, power_sqft_lot, power_sqft_living15,
     power_sqft_lot15, power_yard_size) = [
         power.fit_transform(np.array(logged[i]).reshape(-1, 1))
         for i in range(6)
     ]

    # yard_size is transformed and returned but not plotted.
    histograms = (
        (power_price, 'r'),
        (power_sqft_living, 'b'),
        (power_sqft_lot, 'g'),
        (power_sqft_living15, 'pink'),
        (power_sqft_lot15, 'y'),
    )
    for values, color in histograms:
        plt.hist(values, bins='auto', color=color, alpha=.7)
    plt.title('Power Transformed Variables (Centered around Zero)')
    powered_vars = [
        power_price, power_sqft_living, power_sqft_living15, power_sqft_lot,
        power_sqft_lot15, power_yard_size
    ]
    return powered_vars, plt.show()
コード例 #30
0
ファイル: data_processor.py プロジェクト: john-james-sf/Ames
    def transform(self, X, **transform_params):
        """
        Impute and power-transform the continuous columns of ``X`` in place.

        Missing values in ``self._continuous`` are filled with an
        ``IterativeImputer``; the columns are then Yeo-Johnson transformed
        without standardisation. Both estimators are created and fitted on
        every call. Entry and exit are reported through ``notify``.

        :param X: DataFrame to modify.
        :param transform_params: accepted but not used.
        :return: the same ``X`` object.
        """
        notify.entering(__class__.__name__, "transform")
        continuous = self._continuous

        # Fill missing values as a function of the other continuous features
        X[continuous] = IterativeImputer().fit_transform(X[continuous])

        # Bring the feature distributions closer to Gaussian
        yeo_johnson = PowerTransformer(method="yeo-johnson", standardize=False)
        X[continuous] = yeo_johnson.fit_transform(X[continuous])

        notify.leaving(__class__.__name__, "transform")
        return X
コード例 #31
0
# One colour per distribution.
colors = ['firebrick', 'darkorange', 'goldenrod',
          'seagreen', 'royalblue', 'darkorchid']

# 4x3 grid of panels, addressed through the flattened (row-major) array.
fig, axes = plt.subplots(nrows=4, ncols=3)
axes = axes.flatten()
# Each pair is (panel for the original data, panel directly below it for the
# transformed data).
axes_idxs = [(0, 3), (1, 4), (2, 5), (6, 9), (7, 10), (8, 11)]
axes_list = [(axes[i], axes[j]) for i, j in axes_idxs]


for distribution, color, axes in zip(distributions, colors, axes_list):
    name, X = distribution
    # scale all distributions to the range [1e-10, 10]; the lower bound is
    # kept strictly above zero
    X = minmax_scale(X, feature_range=(1e-10, 10))

    # perform power transform
    X_trans = pt.fit_transform(X)
    lmbda = round(pt.lambdas_[0], 2)

    ax_original, ax_trans = axes

    ax_original.hist(X, color=color, bins=BINS)
    ax_original.set_title(name, fontsize=FONT_SIZE)
    ax_original.tick_params(axis='both', which='major', labelsize=FONT_SIZE)

    ax_trans.hist(X_trans, color=color, bins=BINS)
    # Raw string: '\l' is not a valid escape sequence in a normal literal.
    ax_trans.set_title(r'{} after Box-Cox, $\lambda$ = {}'.format(name, lmbda),
                       fontsize=FONT_SIZE)
    ax_trans.tick_params(axis='both', which='major', labelsize=FONT_SIZE)


plt.tight_layout()