Example #1
def test(samples, la=-20, lb=20):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    prob = boxcox_normplot(samples, la, lb, plot=ax)
    best_lambda = boxcox_normmax(samples)
    ax.axvline(best_lambda, color='r')
    plt.show()
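A minimal driver for this snippet might look like the following sketch; it assumes the snippet's missing imports, and the lognormal sample data is purely illustrative:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox_normmax, boxcox_normplot

# Right-skewed toy data; the red axvline should sit near the peak of the
# correlation profile drawn by boxcox_normplot.
rng = np.random.default_rng(0)
samples = rng.lognormal(mean=0.0, sigma=1.0, size=500)
test(samples)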
Example #2
def features_engineer(X):
    X["TotalSF"] = X["GrLivArea"] + X["TotalBsmtSF"]
    X["TotalPorchSF"] = X["OpenPorchSF"] + X["EnclosedPorch"] + X[
        "3SsnPorch"] + X["ScreenPorch"]
    X["TotalBath"] = X["FullBath"] + X["BsmtFullBath"] + 0.5 * (
        X["BsmtHalfBath"] + X["HalfBath"])

    cols = ["MSSubClass", "YrSold"]
    X[cols] = X[cols].astype("category")

    X["SinMoSold"] = np.sin(2 * np.pi * X["MoSold"] / 12)
    X["CosMoSold"] = np.cos(2 * np.pi * X["MoSold"] / 12)
    X = X.drop("MoSold", axis=1)

    skew = X.skew(numeric_only=True).abs()
    cols = skew[skew > 1].index
    for col in cols:
        X[col] = boxcox1p(X[col], boxcox_normmax(X[col] + 1))

    cols = X.select_dtypes(np.number).columns
    X[cols] = RobustScaler().fit_transform(X[cols])

    X = pd.get_dummies(X)

    X_train = X.loc[train.index]
    X_test = X.loc[test.index]
    return X_train, X_test
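features_engineer assumes module-level train and test frames whose row-wise concatenation forms X (it also relies on np, boxcox1p, boxcox_normmax, and RobustScaler being imported). A hedged setup sketch; the file names and the Ames-style "Id"/"SalePrice" columns are assumptions:

import pandas as pd

# Hypothetical setup: stack train and test with their original indices
# preserved, so that .loc can split them back apart at the end.
train = pd.read_csv("train.csv", index_col="Id")
test = pd.read_csv("test.csv", index_col="Id")
X = pd.concat([train.drop(columns="SalePrice"), test])
X_train, X_test = features_engineer(X)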
Example #3
    def test_mle(self):
        maxlog = stats.boxcox_normmax(self.x, method='mle')
        assert_allclose(maxlog, 1.758101, rtol=1e-6)

        # Check that boxcox() uses 'mle'
        _, maxlog_boxcox = stats.boxcox(self.x)
        assert_allclose(maxlog_boxcox, maxlog)
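The same check can be reproduced on synthetic data; a self-contained sketch in which the lognormal sample stands in for the fixture's self.x:

import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
x = rng.lognormal(size=200)
lmbda_mle = stats.boxcox_normmax(x, method='mle')
_, lmbda_boxcox = stats.boxcox(x)  # boxcox() also fits lambda by MLE
assert np.isclose(lmbda_mle, lmbda_boxcox, rtol=1e-5)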
Example #5
def resolve_skewness(complete_df, numeric_features):
    # %% ~~~~~ Resolve skewness ~~~~ TODO disguise the code
    from scipy.stats import skew

    skew_features = complete_df[numeric_features].apply(
        lambda x: skew(x)).sort_values(ascending=False)
    skews = pd.DataFrame({'skew': skew_features})

    print()
    print('--------- SKEW OF FEATURES ----------')
    print(skew_features)
    print()

    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax

    high_skew = skew_features[skew_features > 0.5]
    skew_index = high_skew.index

    for i in skew_index:
        complete_df[i] = boxcox1p(complete_df[i],
                                  boxcox_normmax(complete_df[i] + 1))

    # Check it is adjusted
    skew_features2 = complete_df[numeric_features].apply(
        lambda x: skew(x)).sort_values(ascending=False)
    skews2 = pd.DataFrame({'skew': skew_features2})
    print()
    print('--------- SKEW OF FEATURES AFTER NORMALIZATION ----------')
    print(skew_features2)
    print()
    return complete_df
Example #6
def transform_numerical_features(df_train, df_test):
    """
    TODO currently deals with positive skewed features, not negative. Analyse this.
    :param df_train:
    :param df_test:
    :return:
    """
    # apply log, scaling features
    # SalePrice. Check log against log(1+x), log1p
    df_train['SalePrice'] = np.log1p(df_train['SalePrice'])
    # Check for skewed features
    features_skewness = []
    for k in df_train.columns:
        if k == 'SalePrice':
            pass
        elif df_train.dtypes[k] != object:
            features_skewness.append([k, float(stats.skew(df_train[k]))])
    features_skewness_df = pd.DataFrame(features_skewness,
                                        columns=['F', 'S']).sort_values(by='S')
    pos_skewed_features = features_skewness_df[
        features_skewness_df['S'] > 0.5]['F']
    neg_skewed_features = features_skewness_df[
        features_skewness_df['S'] < -0.5]['F']
    # Negatively skewed features (skewness < -0.5) are left untouched for now (see the TODO above)
    # Apply boxcox1p to positively skewed features (skewness > 0.5)
    #     boxcox1p(x,lmbda):
    #       y = log(1+x) if lmbda==0
    #       y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0
    # the Box-Cox Power transformation only works if all the data is positive and greater than 0
    for f in pos_skewed_features:
        box_cox_coef = stats.boxcox_normmax(df_train[f] + 1)
        df_train[f] = special.boxcox1p(df_train[f], box_cox_coef)
        df_test[f] = special.boxcox1p(df_test[f], box_cox_coef)
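The boxcox1p definition quoted in the comment block can be verified directly; a small self-contained check:

import numpy as np
from scipy.special import boxcox1p

x = np.array([0.0, 1.5, 10.0])
lmbda = 0.3
manual = ((1 + x) ** lmbda - 1) / lmbda            # the lmbda != 0 branch
assert np.allclose(boxcox1p(x, lmbda), manual)
assert np.allclose(boxcox1p(x, 0.0), np.log1p(x))  # the lmbda == 0 branch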
Example #7
def dampenSkew(df, num_features):
    skew_matrix = df[num_features].apply(
        lambda column: skew(column)).sort_values(ascending=False)
    skewed_features = skew_matrix[(abs(skew_matrix) > 1.0)].index
    for feature in skewed_features:
        df[feature] = boxcox1p(df[feature], boxcox_normmax(df[feature] + 1))
    return df
Example #8
def fix_skewed(x, numeric_dtypes):
    skew_features = x.select_dtypes(numeric_dtypes).apply(
        lambda x: skew(x)).sort_values(ascending=False)

    high_skew = skew_features[skew_features > 0.5]
    skew_index = high_skew.index
    for i in skew_index:
        x[i] = boxcox1p(x[i], boxcox_normmax(x[i] + 1))

    return x
Example #9
def fixing_skewness(data):
    ## Getting all the data that are not of "object" type.
    numeric_feats = data.dtypes[data.dtypes != "object"].index
    # Check the skew of all numerical features
    skewed_feats = data[numeric_feats].apply(lambda x: skew(x)).sort_values(
        ascending=False)
    high_skew = skewed_feats[abs(skewed_feats) > 0.5]
    skewed_features = high_skew.index
    for feat in skewed_features:
        data[feat] = boxcox1p(data[feat], boxcox_normmax(data[feat] + 1))
Example #10
    def skewed_transform(self, df):
        # transform skewed, non-normally distributed data toward a normal distribution
        skews_df = self.skew_check(df)  # the skewness of each column in df
        high_skews = skews_df[abs(skews_df) >
                              0.5]  # select the columns with |skewness| > 0.5
        # transform the skewed columns; the +1 shift keeps the data strictly
        # positive, as Box-Cox requires (boxcox1p applies the same shift)
        for ind in high_skews.index:
            df[ind] = boxcox1p(df[ind], boxcox_normmax(df[ind] + 1))
        return df
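The +1 shift exists because Box-Cox is only defined for strictly positive data; a sketch of what goes wrong without it (the toy array is illustrative):

import numpy as np
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p

x = np.array([0.0, 1.0, 2.0, 5.0, 30.0])  # contains a zero
try:
    boxcox_normmax(x)  # the zero violates the positivity requirement
except ValueError as err:
    print(err)  # "Data must be positive."
lam = boxcox_normmax(x + 1)  # shifting by 1 makes every value positive
y = boxcox1p(x, lam)         # boxcox1p applies the same 1 + x shift internally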
Example #11
    def __init__(self, input, name, boxcox=False, rewrite=False):
        super(Preprocessing, self).__init__()

        if not os.path.exists('data/utils'):
            os.makedirs('data/utils')

        path = 'data/utils/%s_params.npz' % name

        # Check if params require rewriting
        if os.path.exists(path):

            data = np.load(path)

            if (boxcox and data['l'].size == 0
                    or input.shape[1:] != data['mean'].shape):

                rewrite = True

        # Compute or read preprocessing params
        if not os.path.exists(path) or rewrite:

            if boxcox:

                # Compute boxcox transformation parameters
                input_ = input.reshape(input.shape[0], -1)
                l = np.empty(input_.shape[1])
                for i in range(len(l)):
                    l[i] = stats.boxcox_normmax(input_[:, i] + 1e-8,
                                                method='mle')
                self.l = l.reshape(input.shape[1:]).astype('float32')

                l = np.broadcast_to(self.l, input.shape) + 1e-8
                input = ((input + 1e-8)**l - 1) / l
            else:

                self.l = np.array([])

            self.mean = input.mean(0).astype('float32')
            self.std = input.std(0).astype('float32')
            self.min = input.min(0).astype('float32')
            self.max = input.max(0).astype('float32')
            np.savez(path,
                     l=self.l,
                     mean=self.mean,
                     std=self.std,
                     min=self.min,
                     max=self.max)
        else:

            self.l = data['l']
            self.mean = data['mean']
            self.std = data['std']
            self.min = data['min']
            self.max = data['max']
Example #12
def skewed_features(self, threshold=0.75, fix=False, return_series=True):

    df = self
    feature_skew = df.apply(lambda x: skew(x)).sort_values(ascending=False)

    if fix is True:
        high_skew = feature_skew[feature_skew > threshold]
        skew_index = high_skew.index
        for feature in skew_index:
            df[feature] = boxcox1p(df[feature], boxcox_normmax(df[feature] + 1))
    if return_series is True:
        return feature_skew
Example #13
def fix_skewness(df):
    feature_skew, numeric_features = feature_skewness(df)
    high_skew = feature_skew[feature_skew > 0.75]
    skew_index = high_skew.index

    for i in skew_index:
        df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

    skew_features = df[numeric_features].apply(lambda x: skew(x)).sort_values(
        ascending=False)
    skews = pd.DataFrame({'skew': skew_features})
    return df
Example #14
    def __remove_skew_boxcox(self, _df):
        # Code taken from https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition and adapted to our needs.
        df = _df.copy()
        for i in self.skew_index:
            try:
                df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))
            except (TypeError, ValueError) as E:
                if self.verbose > 0:
                    print(
                        str(E) +
                        '.\nThus, skipping the boxcox Transformation for {}.\n----'
                        .format(i))
        return df
Example #15
def boxcox(data):
    # Compute the skewness of each numeric column's distribution
    skew_features = data[numeric_columns(data)].apply(
        lambda col_vals: skew(col_vals)).sort_values(ascending=False)
    print(skew_features)
    # Apply a Box-Cox transform to highly skewed columns to bring them closer to normal
    # The transform proposed by Box and Cox lets a linear regression model satisfy
    # linearity, independence, homoscedasticity, and normality without losing information.
    high_skew = skew_features[skew_features > 0.5]
    for feature in high_skew.index:
        if (data[feature].min() >= 0):
            data[feature] = boxcox1p(data[feature],
                                     boxcox_normmax(data[feature] + 1))
    return data
Example #16
    def transform(self, X):
        numeric_columns = get_numeric_columns(X)

        skewed_columns = X[numeric_columns].apply(
            lambda x: skew(x.dropna())).sort_values(ascending=False)
        skewed_columns = skewed_columns[abs(skewed_columns) > 0.5]
        skewed_features = skewed_columns.index

        for feat in skewed_features:
            # raise ValueError("Data must be positive.") occurs; the data likely contains zeros, so just shift by +1?
            X[feat] = boxcox1p(X[feat], boxcox_normmax(X[feat] + 1))

        return X
Example #17
def boxcox_on_skewed_features(df, target, s: float):
    """
    Get a df, target and s (skewness), returns boxcox on df.
    """

    num_features = list(df.select_dtypes(include=np.number).columns)
    num_features.remove(target)

    skewed_features = df[num_features].apply(lambda x: skew(x))
    high_skew = skewed_features[abs(skewed_features) > s]

    for feat in high_skew.index:
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))
    return df
Example #18
    def fixing_skewness(self):
        """
        Function takes in a dataframe and return fixed skewed dataframe
        """
        ## Getting all the data that are not of "object" type. 
        numeric = self.data.dtypes[self.data.dtypes != "object"].index

        # Check the skew of all numerical features
        skewed_feats = self.data[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)
        high_skew = skewed_feats[abs(skewed_feats) > 0.5]
        skewed_features = high_skew.index

        for feat in skewed_features:
            self.data[feat] = boxcox1p(self.data[feat], boxcox_normmax(self.data[feat] + 1))
Example #19
def applyBoxCox(features):
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerics = []
    for i in features.columns:
        if features[i].dtype in numeric_dtypes:
            numerics.append(i)
    skew_features = features[numerics].apply(lambda x: skew(x)).sort_values(ascending=False)

    high_skew = skew_features[skew_features > 0.5]
    skew_index = high_skew.index
    bc_alphas = {}
    for i in skew_index:
        alpha = boxcox_normmax(features[i]+1)
        bc_alphas[i] = alpha
        features[i] = boxcox1p(features[i], alpha)
    return features, bc_alphas
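Returning bc_alphas alongside the transformed frame lets the fitted lambdas be reapplied to held-out data instead of refitting them there; a hedged sketch, where train_features and test_features are hypothetical frames sharing the same columns:

from scipy.special import boxcox1p

train_features, bc_alphas = applyBoxCox(train_features)
for col, alpha in bc_alphas.items():
    test_features[col] = boxcox1p(test_features[col], alpha)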
Example #20
    def skewed_features(self, threshold=0.75, fix=False, return_series=False):
        """
        Returns the list of numerical features that present skewness
        :return: A pandas Series with the features and their skewness
        """
        df = self.select('numerical')
        feature_skew = df.apply(lambda x: skew(x)).sort_values(ascending=False)

        if fix is True:
            high_skew = feature_skew[feature_skew > threshold]
            skew_index = high_skew.index
            for feature in skew_index:
                self.features[feature] = boxcox1p(
                    df[feature], boxcox_normmax(df[feature] + 1))
        if return_series is True:
            return feature_skew
Example #21
def fix_skewness(df):
    """Fix skewness in a dataframe

    Args:
        df (pandas.DataFrame): The input dataset

    Returns:
        df (pandas.DataFrame): The dataframe with skewness fixed
    """
    # Skewness of all numerical features
    num_feat = df.dtypes[df.dtypes != "object"].index
    skewed_num_feat = df[num_feat].apply(lambda x: skew(x)).sort_values(
        ascending=False)
    high_skew = skewed_num_feat[
        abs(skewed_num_feat) > 0.5].index  # high skewed if skewness above 0.5

    # Use a Box-Cox transformation to fix skewness
    for feat in high_skew:
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))
    return df
Example #22
def box_cox_transform(endog):
    logging.info(
        'BoxCox transform for everything, return lambdas and shifts for restoring original data'
    )
    endog_boxcox = pd.DataFrame(index=endog.index)
    lambdas = dict()
    shifts = dict()
    for column in endog.columns:
        # logging.info('Transforming column {} from boxcox space'.format(column))
        shift = 0
        if endog[column].min() <= 0.0:
            shift = -endog[column].min() * 1.01 + 1
        endog_boxcox[column] = endog[column] + shift
        lambdas[column] = stats.boxcox_normmax(endog_boxcox[column])
        endog_boxcox[column] = stats.boxcox(endog_boxcox[column],
                                            lambdas[column])
        shifts[column] = shift
        # logging.info('Last value for column {} : {}'.format(column, endog_boxcox[column][-1:]))
    return endog_boxcox, lambdas, shifts
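Because the lambdas and shifts are returned, the transform can be undone later; a minimal inverse sketch (box_cox_restore is a hypothetical name, built on scipy.special.inv_boxcox):

import pandas as pd
from scipy.special import inv_boxcox

def box_cox_restore(endog_boxcox, lambdas, shifts):
    # Invert the Box-Cox transform column by column, then remove the shift.
    restored = pd.DataFrame(index=endog_boxcox.index)
    for column in endog_boxcox.columns:
        restored[column] = inv_boxcox(endog_boxcox[column],
                                      lambdas[column]) - shifts[column]
    return restored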
Example #23
def fixing_skewness(df):
    """
    This function takes in a dataframe and return fixed skewed dataframe
    """
    ## Import necessary modules
    from scipy.stats import skew
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax

    ## Getting all the data that are not of "object" type.
    numeric_feats = df.dtypes[df.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = df[numeric_feats].apply(lambda x: skew(x)).sort_values(
        ascending=False)
    high_skew = skewed_feats[abs(skewed_feats) > 0.5]
    skewed_features = high_skew.index

    for feat in skewed_features:
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))
    return df
Example #24
def fix_skewness(df, skew_thresh=0.5):
    # Fix features that don't have a normal distribution by applying a Box-Cox transformation
    # Get all the numerical features in our dataset
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric = []
    for i in df.columns:
        if df[i].dtype in numeric_dtypes:
            numeric.append(i)

    # Calculate the skew for all the numeric features
    features_skew = df[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

    # Define all the features with skewness above skew_thresh as skewed and apply transformation
    high_skew = features_skew[features_skew > skew_thresh]

    # Apply Box-Cox transformation to skewed features
    for i in high_skew.index:
        df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

    return df
Example #25
def BCLambda(data):
    """
    Gives optimal value of Box-Cox power transformation index
    lambda

    Parameters
    ----------
    data: 1D numpy array
        data in 1D numpy array

    Returns
    -------
    blambda: float
        Box-Cox power transformation index
    """
    data_in = array(data, dtype=float)
    data_in = abs(data_in)
    blambda = boxcox_normmax(data_in, brack=(-1.0, 2.0))
    if blambda > 2.0:
        blambda = 2.0
    if blambda < -1.0:
        blambda = -1.0
    return blambda
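Typical usage of this helper might be the following sketch; it assumes the snippet's own imports (numpy's array and scipy's boxcox_normmax), and the gamma sample is illustrative:

import numpy as np
from scipy.stats import boxcox

data = np.random.default_rng(1).gamma(shape=2.0, scale=3.0, size=300)
lam = BCLambda(data)  # clamped to the interval [-1.0, 2.0]
transformed = boxcox(data, lmbda=lam)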
Example #26
    return df_all

# Apply feature engineering to training and test data
X_train = FE(X_tr)
print("shape and columns of X_train after initial processing")
print(X_train.shape)
print(X_train.columns)
X_test = FE(X_te)
print("shape and columns of X_test after initial processing")
print(X_test.shape)
print(X_test.columns)

# fix skew in some variables
fix_var = ['LotFrontage', 'LotArea', 'BsmtUnfSF', '1stFlrSF', 'GrLivArea', 'TotalSF']
for var in fix_var:
    lam = boxcox_normmax(X_train[var] + 1)
    X_train[var] = boxcox1p(X_train[var], lam)
    X_test[var] = boxcox1p(X_test[var], lam)

# Dummify X_train & X_test
Xd_train = create_Xdummy(X_train)
Xd_test = create_Xdummy(X_test)

# get rid of columns that only appear in 1 dataset but not the other
col1 = set(Xd_train.columns) - set(Xd_test.columns)
col2 = set(Xd_test.columns) - set(Xd_train.columns)
Xd_train.drop(col1, axis=1, inplace=True)
Xd_test.drop(col2, axis=1, inplace=True)

# remove dummy columns that have very very low number of 1s
col2del = []
Example #27
    def fit(self, X):
        if not self.lmbdaGiven:
            self.lmbda = boxcox_normmax(X, method='mle')
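A matching transform step for this fit fragment might look like the hypothetical sketch below (the enclosing class is not shown; scipy.special.boxcox applies the transform for a fixed lambda):

    def transform(self, X):
        # Apply the Box-Cox transform with the lambda found (or given) in fit();
        # assumes: from scipy.special import boxcox
        return boxcox(X, self.lmbda)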
Example #28
features.update(features[numerics].fillna(0))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in features.columns:
    if features[i].dtype in numeric_dtypes:
        numerics2.append(i)

skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(
    ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))

features = features.drop([
    'Utilities',
    'Street',
    'PoolQC',
], axis=1)

features['YrBltAndRemod'] = features['YearBuilt'] + features['YearRemodAdd']
features['TotalSF'] = features['TotalBsmtSF'] + features[
    '1stFlrSF'] + features['2ndFlrSF']

features['Total_sqr_footage'] = (features['BsmtFinSF1'] +
                                 features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])
Example #29
def boxcoxArrayArgs(X):
    return [boxcox_normmax(X[:, c]) for c in range(X.shape[1])]
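A quick usage sketch, assuming boxcox_normmax is imported from scipy.stats; the strictly positive random matrix is illustrative:

import numpy as np

rng = np.random.default_rng(0)
X = rng.lognormal(size=(100, 3))  # every column must be strictly positive
lambdas = boxcoxArrayArgs(X)      # one fitted lambda per column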
Example #30
def Q_6_7():
    data = pd.read_csv('Data/LinearStatistic.csv')
    print('Raw data:\n', data)

    # Data preprocessing
    data_list = data.to_dict()
    for i in range(0,len(data_list['x1'])):
        data_list['x1'][i] = data_list['x1'][i]*data_list['x1'][i]
    data = pd.DataFrame(data_list)
    # Describe the data
    print('Data description:\n', data.describe())
    # Missing-value check
    print('Missing-value check:\n', data.isnull().sum())

    data.boxplot()
    plt.savefig("result/线性统计/6.7/boxplot_linear.jpg")
    plt.title('Data feature analysis')
    plt.show()
    # Correlation matrix: r = cov(x, y) / (std(x) * std(y)) == cov(x,y)/(σx*σy)
    # correlation 0~0.3: weak, 0.3~0.6: moderate, 0.6~1: strong
    print('Correlation coefficients: ', data.corr())


    # With the extra parameter kind='reg', seaborn adds a best-fit line and a 95% confidence band.
    sns.pairplot(data, x_vars=['x1', 'x2'], y_vars='y', height=7, aspect=0.8, kind='reg')
    plt.savefig("result/线性统计/6.7/pairplot_linear.jpg")
    plt.title('Effect of the different factors')
    plt.show()

    X_train, X_test, Y_train, Y_test = train_test_split(data.iloc[:, :2], data.iloc[:, 2:3], train_size=.80)

    print("original features:", data.iloc[:, :2].shape,
          ", training features:", X_train.shape,
          ", test features:", X_test.shape)

    print("original labels:", data.iloc[:, 2:3].shape,
          ", training labels:", Y_train.shape,
          ", test labels:", Y_test.shape)

    model = LinearRegression()
    model.fit(X_train,Y_train)
    a = model.intercept_  # intercept
    b = model.coef_  # regression coefficients
    print("Best-fit line: intercept", a, ", coefficients:", b)

    # R-squared check
    # The coefficient of determination (R squared)
    # evaluates the accuracy of the model:
    # SSE = Σ(y_actual - y_predicted)^2
    # total variation of y = Σ(y_actual - y_mean)^2
    # share of the variation in y NOT described by the fit = SSE / total variation
    # share described by the fit = 1 - SSE / total variation = R squared
    # For R squared: 1) goodness of fit: the share of the variation in y explained by the regression line (the variation in x)
    # 2) magnitude: the higher R squared, the more accurate the model (range 0~1); 1 means no error, 0 means the fit fails entirely
    score = model.score(X_test, Y_test)
    print('R-squared:', score)

    # Predict with the linear regression model
    Y_pred = model.predict(X_test)
    print(Y_pred)


    # Plot the results
    plt.figure()
    plt.plot(range(len(Y_pred)), Y_pred, 'b', label="predict")
    plt.plot(range(len(Y_pred)), Y_test, 'r', label="test")
    plt.legend(loc="upper right")  # show the labels in the figure
    plt.xlabel("Y")
    plt.ylabel('the number of Y')
    plt.title('Predictions vs. source data')
    plt.savefig("result/线性统计/6.7/compare_linear.jpg")
    plt.show()


    # Residuals of the predictions
    # (the enumerate function turns a list into index-element pairs)
    y_dif = []
    for i in range(len(Y_pred)):
        y_dif.append(Y_pred[i,0]-Y_test['y'].values[i])
    tmp = {'x':range(len(y_dif)),'y':y_dif}
    df = pd.DataFrame(tmp)
    sns.residplot(x="x", y="y",data=df)
    plt.savefig("result/线性统计/6.7/残差图1.jpg")
    plt.title('Residual plot 1')
    plt.show()

    # Box-Cox transform
    Y = data['y'].tolist()
    lam_best = boxcox_normmax(Y)
    print('lambda: ',lam_best)
    Y = special.boxcox1p(Y, lam_best)

    # Regression after the Box-Cox transform
    for i in range(len(Y)):
        data_list['y'][i] = Y[i]
    data = pd.DataFrame(data_list)


    data.boxplot()
    plt.savefig("result/线性统计/6.7/boxplot_linear_bxo_cox.jpg")
    plt.title('Data feature analysis after Box-Cox')
    plt.show()
    # Correlation matrix: r = cov(x, y) / (std(x) * std(y)) == cov(x,y)/(σx*σy)
    # correlation 0~0.3: weak, 0.3~0.6: moderate, 0.6~1: strong
    print('Correlation coefficients: ', data.corr())


    # With the extra parameter kind='reg', seaborn adds a best-fit line and a 95% confidence band.
    sns.pairplot(data, x_vars=['x1', 'x2'], y_vars='y', height=7, aspect=0.8, kind='reg')
    plt.savefig("result/线性统计/6.7/pairplot_linear_bxo_cox.jpg")
    plt.title('Effect of the different factors after Box-Cox')
    plt.show()

    X_train, X_test, Y_train, Y_test = train_test_split(data.iloc[:, :2], data.iloc[:, 2:3], train_size=.80)
    model = LinearRegression()
    model.fit(X_train, Y_train)
    a = model.intercept_  # intercept
    b = model.coef_  # regression coefficients
    print("Best-fit line after Box-Cox: intercept", a, ", coefficients:", b)

    # R-squared check
    # The coefficient of determination (R squared)
    # evaluates the accuracy of the model:
    # SSE = Σ(y_actual - y_predicted)^2
    # total variation of y = Σ(y_actual - y_mean)^2
    # share of the variation in y NOT described by the fit = SSE / total variation
    # share described by the fit = 1 - SSE / total variation = R squared
    # For R squared: 1) goodness of fit: the share of the variation in y explained by the regression line (the variation in x)
    # 2) magnitude: the higher R squared, the more accurate the model (range 0~1); 1 means no error, 0 means the fit fails entirely
    score = model.score(X_test, Y_test)
    print('R-squared check:', score)

    # Predict with the linear regression model
    Y_pred = model.predict(X_test)
    print(Y_pred)

    # Plot the results
    plt.figure()
    plt.plot(range(len(Y_pred)), Y_pred, 'b', label="predict")
    plt.plot(range(len(Y_pred)), Y_test, 'r', label="test")
    plt.legend(loc="upper right")  # show the labels in the figure
    plt.ylabel("Y")
    plt.xlabel('the number of Y')
    plt.title('Predictions vs. source data after Box-Cox')
    plt.savefig("result/线性统计/6.7/compare_linear_box_cox.jpg")
    plt.show()


    # Residuals of the predictions
    # (the enumerate function turns a list into index-element pairs)
    y_dif = []
    for i in range(len(Y_pred)):
        y_dif.append(Y_pred[i,0]-Y_test['y'].values[i])
    tmp = {'x':range(len(y_dif)),'y':y_dif}
    df = pd.DataFrame(tmp)
    sns.residplot(x="x", y="y",data=df)
    plt.savefig("result/线性统计/6.7/残差图_bxo_cox.jpg")
    plt.title('Residual plot after Box-Cox')
    plt.show()
Example #31
def feature_selection(train=pd.DataFrame(), test=pd.DataFrame()):
    explore_data_analysis(train)
    train = train[train['GrLivArea'] < 4500]
    train['SalePrice'] = train['SalePrice'].map(lambda x: math.log(1 + x))
    y = train['SalePrice']
    train_features = train.drop("SalePrice", axis=1)
    test_features = test
    features = pd.concat([train_features, test_features], sort=False)
    features['MSSubClass'] = features['MSSubClass'].apply(str)
    features['YrSold'] = features['YrSold'].astype(str)
    features['MoSold'] = features['MoSold'].astype(str)
    features['Functional'] = features['Functional'].fillna('Typ')
    features['Electrical'] = features['Electrical'].fillna("SBrkr")
    features['KitchenQual'] = features['KitchenQual'].fillna("TA")
    features["PoolQC"] = features["PoolQC"].fillna("None")
    features['Exterior1st'] = features['Exterior1st'].fillna(
        features['Exterior1st'].mode()[0])
    features['Exterior2nd'] = features['Exterior2nd'].fillna(
        features['Exterior2nd'].mode()[0])
    features['SaleType'] = features['SaleType'].fillna(
        features['SaleType'].mode()[0])
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        features[col] = features[col].fillna(0)
    for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
        features[col] = features[col].fillna('None')
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                'BsmtFinType2'):
        features[col] = features[col].fillna('None')
    features['MSZoning'] = features.groupby(
        'MSSubClass')['MSZoning'].transform(
            lambda x: x.fillna(x.mode()[0]))  # fill with the most frequent value (the mode)
    object_cols = list(features.select_dtypes(include='object').columns)
    features.update(features[object_cols].fillna('None'))
    features['LotFrontage'] = features.groupby(
        'Neighborhood')['LotFrontage'].transform(
            lambda x: x.fillna(x.median()))  # fill with the median
    numeric_dtypes = list(
        features.select_dtypes(include=[
            'int16', 'int32', 'int64', 'float16', 'float32', 'float64'
        ]).columns)
    features.update(features[numeric_dtypes].fillna(0))

    # Collect every feature with skewness > 0.5
    skew_features = features[numeric_dtypes].apply(
        lambda x: skew(x)).sort_values(
            ascending=False)  # sort by skewness, largest first

    high_skew = skew_features[skew_features > 0.5]
    skew_high_index = high_skew.index
    for i in skew_high_index:
        # apply a Box-Cox normality correction to columns with skewness > 0.5
        features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))
    features = features.drop(['Utilities', 'Street', 'PoolQC'], axis=1)
    features[
        'YrBltAndRemod'] = features['YearBuilt'] + features['YearRemodAdd']
    features['TotalSF'] = features['TotalBsmtSF'] + features[
        '1stFlrSF'] + features['2ndFlrSF']

    features['Total_sqr_footage'] = (features['BsmtFinSF1'] +
                                     features['BsmtFinSF2'] +
                                     features['1stFlrSF'] +
                                     features['2ndFlrSF'])

    features['Total_Bathrooms'] = (features['FullBath'] +
                                   (0.5 * features['HalfBath']) +
                                   features['BsmtFullBath'] +
                                   (0.5 * features['BsmtHalfBath']))

    features['Total_porch_sf'] = (features['OpenPorchSF'] +
                                  features['3SsnPorch'] +
                                  features['EnclosedPorch'] +
                                  features['ScreenPorch'] +
                                  features['WoodDeckSF'])
    features['haspool'] = features['PoolArea'].apply(lambda x: 1
                                                     if x > 0 else 0)
    features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1
                                                         if x > 0 else 0)
    features['hasgarage'] = features['GarageArea'].apply(lambda x: 1
                                                         if x > 0 else 0)
    features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1
                                                        if x > 0 else 0)
    features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1
                                                            if x > 0 else 0)
    # One-hot encode all the categorical variables
    final_features = pd.get_dummies(features)
    # Split back into X (train), y, and test
    X = final_features.iloc[:len(y), :]
    test = final_features.iloc[len(y):, :]
    # print("X shape:{}\t y shape:{}\t test shape:{}".format(X.shape, y.shape, test.shape))
    overfit = []
    for i in X.columns:
        counts = X[i].value_counts()
        zeros = counts.iloc[0]  # count of the most frequent value in column i
        if zeros / len(X[i]) * 100 > 99.94:
            overfit.append(i)  # columns where one value exceeds 99.94% are prone to overfitting
    X = X.drop(overfit, axis=1)
    test = test.drop(overfit, axis=1)
    print("X shape:{}\t y shape:{}\t test shape:{}".format(
        X.shape, y.shape, test.shape))
    return X, y, test
Example #32
def ets(y, horizon, quantile=0.95, opt_crit='lik', obj='rmse', boxcox=True):
    """
    Calls R for each model and returns fcast, model and metrics.
    The model selection is based on AIC but the model params are obtained to minimize opt_crit
    Iterating by hand rather than setting model = 'ZZZ' because some models are ignored sometimes with 'ZZZ'. Not clear why.
    Assumes all the data is > 0.
    :param y: original time series
    :param boxcox: if true is boxcox
    :param horizon: how many points to predict
    :param quantile: CI for forecast error
    :param opt_crit: what to optimize to get the model parameters (lik, mse, sigma, ...). See R
    :param obj: metric to use to select the best model (aic or rmse). Better model if obj is smaller.
    :return: a dict with keys: df_out, model, params, rmse, states (the HW recursion values)
    """
    h_r = robjects.IntVector([horizon])
    lvl_r = robjects.FloatVector([quantile])
    opt_r = robjects.StrVector([opt_crit])
    false_r = robjects.BoolVector([False])
    true_r = robjects.BoolVector([True])
    null_r = robjects.NULL
    env = robjects.r.globalenv()

    # get good BoxCox values for lambda
    y_r = robjects.FloatVector(y)
    if boxcox is True:
        lambdas = list(sps.boxcox_normmax(y, brack=(-5, 0), method='all')) + list(sps.boxcox_normmax(y, brack=(0, 5), method='all'))
        lambda_r = fcast.BoxCox_lambda(y_r, method='guerrero', lower=0, upper=5)[0]
        lambdas.append(lambda_r)
        lambda_r = fcast.BoxCox_lambda(y_r, method='guerrero', lower=-5, upper=0)[0]
        lambdas.append(lambda_r)
        lambdas.append(None)
        lambdas = list(set(lambdas))
    else:
        lambdas = [None]
    data_in = [(y_r, l) for l in lambdas]

    lbda_opt, rmse_opt, season_opt, yhat, yupr, ylwr, params, states, aic_opt, obj_opt = None, None, None, None, None, None, None, None, None, None

    for y_r, l_val in data_in:
        season = int(r_frequency(y_r, env))
        y_r = set_tsp(y_r, season, env)
        season_mdls = ['M', 'A', 'N'] if (season > 1 and len(y) >= 4 * season) else ['N']

        for m in itertools.product(['M', 'A'], ['N', 'M', 'A'], season_mdls):  # iterate over all models
            model = ''.join(m)
            damp_list = [False, True] if m[1] != 'N' else [False]  # try damped trends only when the model has a trend component
            for dp in damp_list:
                d_r = robjects.BoolVector([False]) if dp is False else robjects.BoolVector([True])
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")
                    try:
                        lbda_r = null_r if l_val is None else robjects.FloatVector([l_val])
                        etsObj_r = fcast.ets(y_r, damped=d_r, model=model, restrict=false_r, **{'opt.crit': opt_r, 'allow.multiplicative.trend': true_r, 'lambda': lbda_r})
                        fcast_obj_r = fcast.forecast(etsObj_r, h=h_r, level=lvl_r, **{'find.frequency': true_r, 'lambda': lbda_r})
                        mdl_rmse = np.sqrt(etsObj_r.rx2('mse')[0])
                        mdl_aic = etsObj_r.rx2('aic')[0]
                        mdl_obj = mdl_aic if obj == 'aic' else mdl_rmse

                        # model selection: take the smallest value
                        no_nans = fit_nan_check(np.array([x for x in fcast_obj_r.rx2('mean')]))
                        if (obj_opt is None or mdl_obj < obj_opt) and no_nans:
                            rmse_opt = mdl_rmse
                            aic_opt = mdl_aic
                            lbda_opt = l_val
                            states = etsObj_r.rx2('states')
                            params = {k: v for k, v in etsObj_r.rx2('par').items()}
                            params['model'] = etsObj_r.rx2('method')[0]      # opt_mdl
                            params['lambda'] = l_val
                            params['season'] = season
                            params['type'] = 'ets'
                            yhat = [x for x in fcast_obj_r.rx2('fitted')] + [x for x in fcast_obj_r.rx2('mean')]    # append to one-step fcast the out-of-sample fcast
                            yupr = [x for x in fcast_obj_r.rx2('fitted')] + [x for x in fcast_obj_r.rx2('upper')]
                            ylwr = [x for x in fcast_obj_r.rx2('fitted')] + [x for x in fcast_obj_r.rx2('lower')]

                        if params is not None:
                            params['no_nans'] = no_nans
                        my_log.debug('\tr_ets processed model::rmse: ' + str(mdl_rmse) + ' aic: ' + str(mdl_aic) + ' params: ' + str(params))

                    except rinterface.RRuntimeError:
                        my_log.debug('\tr_ets failed model:   ' + str(model) + ' damped: ' + str(dp) + ' lbda: ' + str(l_val))
                        continue

    my_log.debug('\tr_ets opt model: ' + str(params['model']) + ' rmse: ' + str(rmse_opt) + ' lambda: ' + str(lbda_opt))
    y = list(y) + [np.nan] * horizon
    df = pd.DataFrame({'y': y, 'yhat': yhat, 'yupr': yupr, 'ylwr': ylwr})
    return {'df_out': df, 'model': params, 'rmse': rmse_opt, 'states': states, 'aic': aic_opt}
Example #33
    def test_all(self):
        maxlog_all = stats.boxcox_normmax(self.x, method='all')
        assert_allclose(maxlog_all, [1.804465, 1.758101], rtol=1e-6)
Example #34
    def test_all(self):
        maxlog_all = stats.boxcox_normmax(self.x, method='all')
        assert_allclose(maxlog_all, [1.804465325046, 1.758101454114])
Example #35
    def test_pearsonr(self):
        maxlog = stats.boxcox_normmax(self.x)
        assert_allclose(maxlog, 1.804465, rtol=1e-6)
Example #38
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()