def test(samples, la=-20, lb=20):
    """Show the Box-Cox normality plot of ``samples`` with the optimal lambda marked.

    ``la`` and ``lb`` are the lambda bounds handed to ``boxcox_normplot``. A red
    vertical line is drawn at the value returned by ``boxcox_normmax`` and the
    figure is displayed with ``plt.show()``. Nothing is returned.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Called for its plotting side effect only; the returned values are not used.
    boxcox_normplot(samples, la, lb, plot=ax)
    best_lambda = boxcox_normmax(samples)
    ax.axvline(best_lambda, color='r')
    plt.show()
def features_engineer(X):
    """Derive features, de-skew, scale and one-hot encode ``X``; return train/test parts.

    Steps, in order: add aggregate area/bath columns, cast ``MSSubClass`` and
    ``YrSold`` to category, replace ``MoSold`` by its sine/cosine encoding,
    Box-Cox transform numeric columns whose absolute skew exceeds 1, robust-scale
    all numeric columns and one-hot encode the rest.

    The result is split with ``train.index`` and ``test.index``; ``train`` and
    ``test`` are names this function reads from the enclosing module scope
    (NOTE(review): presumably the raw train/test frames - confirm).

    The caller's ``X`` is left untouched: all work happens on a copy.
    """
    # The original assigned new columns directly on the caller's frame before
    # rebinding ``X``; copy first so the input is not modified as a side effect.
    X = X.copy()

    X["TotalSF"] = X["GrLivArea"] + X["TotalBsmtSF"]
    X["TotalPorchSF"] = (X["OpenPorchSF"] + X["EnclosedPorch"]
                         + X["3SsnPorch"] + X["ScreenPorch"])
    X["TotalBath"] = X["FullBath"] + X["BsmtFullBath"] + 0.5 * (
        X["BsmtHalfBath"] + X["HalfBath"])

    cols = ["MSSubClass", "YrSold"]
    X[cols] = X[cols].astype("category")

    # Cyclical encoding of the month (period 12).
    X["SinMoSold"] = np.sin(2 * np.pi * X["MoSold"] / 12)
    X["CosMoSold"] = np.cos(2 * np.pi * X["MoSold"] / 12)
    X = X.drop("MoSold", axis=1)

    skew = X.skew(numeric_only=True).abs()
    for col in skew[skew > 1].index:
        # boxcox1p transforms 1 + x, so lambda is fitted on the shifted data.
        X[col] = boxcox1p(X[col], boxcox_normmax(X[col] + 1))

    cols = X.select_dtypes(np.number).columns
    X[cols] = RobustScaler().fit_transform(X[cols])

    X = pd.get_dummies(X)

    X_train = X.loc[train.index]
    X_test = X.loc[test.index]
    return X_train, X_test
def test_mle(self):
    """method='mle' yields the expected lambda, and boxcox() agrees with it."""
    lmbda_mle = stats.boxcox_normmax(self.x, method='mle')
    assert_allclose(lmbda_mle, 1.758101, rtol=1e-6)

    # boxcox() called without an explicit lambda must select the same value.
    lmbda_from_boxcox = stats.boxcox(self.x)[1]
    assert_allclose(lmbda_from_boxcox, lmbda_mle)
def resolve_skewness(complete_df, numeric_features):
    """Box-Cox transform, in place, the columns of ``numeric_features`` with skew > 0.5.

    The skewness of every listed column is printed before and after the
    transformation. Only positively skewed columns (skew > 0.5) are changed.

    :param complete_df: frame holding the columns; modified in place
    :param numeric_features: labels of the numeric columns to inspect
    :return: ``complete_df`` (the same object that was passed in)
    """
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax, skew

    def _report_skew(banner):
        # Print the per-column skew, largest first, framed by blank lines.
        values = complete_df[numeric_features].apply(skew).sort_values(
            ascending=False)
        print()
        print(banner)
        print(values)
        print()
        return values

    skew_features = _report_skew('--------- SKEW OF FEATURES ----------')

    for name in skew_features[skew_features > 0.5].index:
        # boxcox1p transforms 1 + x, so lambda is fitted on the shifted column.
        complete_df[name] = boxcox1p(complete_df[name],
                                     boxcox_normmax(complete_df[name] + 1))

    # Check it is adjusted
    _report_skew('--------- SKEW OF FEATURES AFTER NORMALIZATION ----------')
    return complete_df
def transform_numerical_features(df_train, df_test):
    """Log-transform the target and Box-Cox the positively skewed numeric features.

    Both frames are modified in place; nothing is returned.

    * ``df_train['SalePrice']`` is replaced by ``log1p`` of itself.
    * Every other numeric column of ``df_train`` whose skewness exceeds 0.5 is
      transformed with ``boxcox1p``. Lambda is estimated on the training column
      only and the same value is applied to ``df_test``.

    TODO: negatively skewed features (skewness < -0.5) are not handled yet.

    :param df_train: training frame, must contain ``SalePrice``
    :param df_test: test frame with the same feature columns
    :return: None
    """
    # SalePrice. Check log against log(1+x), log1p
    df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

    # Skewness of every numeric feature (the target is excluded).
    # ``float`` replaces the ``np.float`` alias, which NumPy no longer provides.
    features_skewness = [
        [col, float(stats.skew(df_train[col]))]
        for col in df_train.columns
        if col != 'SalePrice' and pd.api.types.is_numeric_dtype(df_train[col])
    ]
    features_skewness_df = pd.DataFrame(
        features_skewness, columns=['F', 'S']).sort_values(by='S')

    # Skewness > 0.5 means a long right tail (positive skew).
    positively_skewed = features_skewness_df[
        features_skewness_df['S'] > 0.5]['F']

    # boxcox1p(x, lmbda):
    #   y = log(1+x)                   if lmbda == 0
    #   y = ((1+x)**lmbda - 1) / lmbda if lmbda != 0
    # Box-Cox needs strictly positive data, hence lambda is fitted on x + 1.
    for f in positively_skewed:
        box_cox_coef = stats.boxcox_normmax(df_train[f] + 1)
        df_train[f] = special.boxcox1p(df_train[f], box_cox_coef)
        df_test[f] = special.boxcox1p(df_test[f], box_cox_coef)
def dampenSkew(df, num_features):
    """Box-Cox transform, in place, the ``num_features`` columns with |skew| > 1.

    Returns the same ``df`` object that was passed in.
    """
    skewness = df[num_features].apply(skew).sort_values(ascending=False)
    strongly_skewed = skewness.index[skewness.abs() > 1.0]
    for name in strongly_skewed:
        column = df[name]
        # lambda is fitted on column + 1 because boxcox1p transforms 1 + x
        df[name] = boxcox1p(column, boxcox_normmax(column + 1))
    return df
def fix_skewed(x, numeric_dtypes):
    """Box-Cox transform, in place, columns of ``x`` whose skew exceeds 0.5.

    Only columns selected by ``x.select_dtypes(numeric_dtypes)`` are examined.
    Returns the same ``x`` object that was passed in.
    """
    numeric_part = x.select_dtypes(numeric_dtypes)
    skewness = numeric_part.apply(skew).sort_values(ascending=False)
    for label in skewness.index[skewness > 0.5]:
        lmbda = boxcox_normmax(x[label] + 1)
        x[label] = boxcox1p(x[label], lmbda)
    return x
def fixing_skewness(data):
    """Box-Cox transform, in place, every non-object column of ``data`` with |skew| > 0.5.

    Nothing is returned; ``data`` itself is modified.
    """
    # Every column whose dtype is not "object" is treated as numeric.
    dtypes = data.dtypes
    numeric_feats = dtypes[dtypes != "object"].index

    # Skew of all numeric columns, largest first.
    skewness = data[numeric_feats].apply(skew).sort_values(ascending=False)

    for name in skewness[skewness.abs() > 0.5].index:
        column = data[name]
        data[name] = boxcox1p(column, boxcox_normmax(column + 1))
def skewed_transform(self, df):
    """Box-Cox transform, in place, the columns of ``df`` whose |skew| exceeds 0.5.

    The per-column skewness comes from ``self.skew_check(df)``. Columns at or
    below the threshold are left untouched. Returns the same ``df`` object.
    """
    skews_df = self.skew_check(df)
    # Keep only the features whose absolute skew is larger than 0.5.
    high_skews = skews_df[abs(skews_df) > 0.5]
    # Iterate over the filtered features: the original looped over
    # ``skews_df.index`` and therefore transformed every column.
    for ind in high_skews.index:
        # boxcox1p transforms 1 + x, so lambda is fitted on df[ind] + 1.
        df[ind] = boxcox1p(df[ind], boxcox_normmax(df[ind] + 1))
    return df
def __init__(self, input, name, boxcox=False, rewrite=False):
    """Compute, or load from cache, the preprocessing parameters for ``input``.

    Parameters are cached in ``data/utils/<name>_params.npz``. They are
    recomputed and re-saved when the cache file is missing, when ``rewrite``
    is true, when Box-Cox is requested but the cache holds no lambdas, or when
    the cached per-feature shape differs from ``input.shape[1:]``.

    :param input: array whose first axis is reduced by mean/std/min/max
    :param name: cache-file stem
    :param boxcox: if true, fit one Box-Cox lambda per feature ('mle') and
        compute the statistics on the transformed data
    :param rewrite: force recomputation even if a cache file exists

    Sets ``self.l`` (lambdas, empty array when Box-Cox is off), ``self.mean``,
    ``self.std``, ``self.min`` and ``self.max``.
    """
    super(Preprocessing, self).__init__()
    if not os.path.exists('data/utils'):
        os.makedirs('data/utils')
    path = 'data/utils/%s_params.npz' % name

    # Check if params require rewriting
    stored = None
    if os.path.exists(path):
        # Read all arrays eagerly so the archive's file handle is closed at once.
        with np.load(path) as data:
            stored = {key: data[key] for key in data.files}
        # A cache written without Box-Cox stores an empty ``l`` array. The
        # original tested ``data['l'] is None``, which is never true for a
        # loaded array, so missing lambdas never triggered a rewrite.
        if (boxcox and stored['l'].size == 0
                or input.shape[1:] != stored['mean'].shape):
            rewrite = True

    # Compute or read preprocessing params
    if stored is None or rewrite:
        if boxcox:
            # Compute boxcox transformation parameters, one lambda per feature
            input_ = input.reshape(input.shape[0], -1)
            l = np.empty(input_.shape[1])
            for i in range(len(l)):
                # 1e-8 is added before fitting and before transforming below
                l[i] = stats.boxcox_normmax(input_[:, i] + 1e-8, method='mle')
            self.l = l.reshape(input.shape[1:]).astype('float32')
            # 1e-8 is added to lambda before it is used as exponent and divisor
            l = np.broadcast_to(self.l, input.shape) + 1e-8
            input = ((input + 1e-8)**l - 1) / l
        else:
            self.l = np.array([])
        self.mean = input.mean(0).astype('float32')
        self.std = input.std(0).astype('float32')
        self.min = input.min(0).astype('float32')
        self.max = input.max(0).astype('float32')
        np.savez(path, l=self.l, mean=self.mean, std=self.std,
                 min=self.min, max=self.max)
    else:
        self.l = stored['l']
        self.mean = stored['mean']
        self.std = stored['std']
        self.min = stored['min']
        self.max = stored['max']
def skewed_features(self, threshold=0.75, fix=False, return_series=True):
    """Compute per-column skewness and optionally Box-Cox the skewed columns.

    ``self`` is used as a DataFrame: its columns are measured with ``skew``.

    :param threshold: columns with skew above this value count as skewed
    :param fix: if True, replace each skewed column in place by its
        ``boxcox1p`` transform (lambda fitted on column + 1)
    :param return_series: if True, return the skewness Series
    :return: skewness per column, sorted descending and measured before any
        fix is applied, or None when ``return_series`` is not True
    """
    feature_skew = self.apply(lambda x: skew(x)).sort_values(ascending=False)
    if fix is True:
        high_skew = feature_skew[feature_skew > threshold]
        for feature in high_skew.index:
            # Write the result back into the column. The original assigned it
            # to the local name ``self``, which silently discarded the fix.
            self[feature] = boxcox1p(self[feature],
                                     boxcox_normmax(self[feature] + 1))
    if return_series is True:
        return feature_skew
def fix_skewness(df):
    """Box-Cox transform, in place, the columns of ``df`` whose skew exceeds 0.75.

    Skewness is obtained from ``feature_skewness(df)``. Returns the same ``df``.
    """
    # feature_skewness returns a second value (the original used it as a column
    # selector); it is not needed here. The original also recomputed the skew
    # after the loop and discarded the result - that dead work is removed.
    feature_skew, _ = feature_skewness(df)
    for name in feature_skew[feature_skew > 0.75].index:
        # lambda is fitted on the column + 1 because boxcox1p transforms 1 + x
        df[name] = boxcox1p(df[name], boxcox_normmax(df[name] + 1))
    return df
def __remove_skew_boxcox(self, _df):
    """Return a copy of ``_df`` with the ``self.skew_index`` columns Box-Cox transformed.

    A column whose transformation raises ``TypeError`` or ``ValueError`` is left
    as it is; the error is printed when ``self.verbose`` is greater than 0.
    """
    # Code taken from https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition and adapted to our needs.
    transformed = _df.copy()
    for column in self.skew_index:
        try:
            transformed[column] = boxcox1p(
                transformed[column], boxcox_normmax(transformed[column] + 1))
        except (TypeError, ValueError) as err:
            if self.verbose > 0:
                notice = '.\nThus, skipping the boxcox Transformation for {}.\n----'.format(column)
                print(str(err) + notice)
    return transformed
def boxcox(data):
    """Box-Cox transform, in place, the non-negative columns of ``data`` with skew > 0.5.

    Columns are taken from ``numeric_columns(data)``. The skewness table is
    printed. Returns the same ``data`` object.
    """
    # Skewness of each numeric column, largest first.
    numeric_part = data[numeric_columns(data)]
    skew_features = numeric_part.apply(skew).sort_values(ascending=False)
    print(skew_features)

    # Highly skewed columns are Box-Cox transformed towards a normal shape.
    # Columns whose minimum is not >= 0 are skipped.
    candidates = skew_features.index[skew_features > 0.5]
    for feature in candidates:
        if not data[feature].min() >= 0:
            continue
        data[feature] = boxcox1p(data[feature],
                                 boxcox_normmax(data[feature] + 1))
    return data
def transform(self, X):
    """Box-Cox transform, in place, the numeric columns of ``X`` with |skew| > 0.5.

    Columns come from ``get_numeric_columns(X)``. Missing values are ignored
    both when measuring skew and when fitting lambda; they stay missing in the
    transformed column. Returns the same ``X`` object.
    """
    numeric_columns = get_numeric_columns(X)
    skewed_columns = X[numeric_columns].apply(
        lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewed_columns = skewed_columns[abs(skewed_columns) > 0.5]
    for feat in skewed_columns.index:
        # Fit lambda on the observed values only, consistent with the skew
        # computation above; the original passed NaN into boxcox_normmax.
        observed = X[feat].dropna()
        # boxcox_normmax raises "Data must be positive." when values are 0,
        # hence the +1 shift (boxcox1p applies the same shift when transforming).
        X[feat] = boxcox1p(X[feat], boxcox_normmax(observed + 1))
    return X
def boxcox_on_skewed_features(df, target, s: float):
    """Box-Cox transform, in place, the numeric features of ``df`` with |skew| > ``s``.

    :param df: frame to transform; modified in place
    :param target: name of a numeric column to exclude from the transformation
    :param s: absolute-skewness threshold
    :return: ``df`` (the same object), as the original docstring promised
    :raises ValueError: if ``target`` is not among the numeric columns
    """
    num_features = list(df.select_dtypes(include=np.number).columns)
    num_features.remove(target)
    skewed_features = df[num_features].apply(lambda x: skew(x))
    high_skew = skewed_features[abs(skewed_features) > s]
    for feat in high_skew.index:
        # boxcox1p transforms 1 + x, so lambda is fitted on the shifted column.
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))
    return df
def fixing_skewness(self):
    """Box-Cox transform, in place, the skewed non-object columns of ``self.data``.

    A column is transformed when its absolute skew exceeds 0.5. Nothing is
    returned.
    """
    # Columns whose dtype is not "object" are treated as numeric.
    dtypes = self.data.dtypes
    numeric = dtypes[dtypes != "object"].index

    # Skew of all numeric columns, largest first.
    skewness = self.data[numeric].apply(skew).sort_values(ascending=False)

    for name in skewness.index[skewness.abs() > 0.5]:
        lmbda = boxcox_normmax(self.data[name] + 1)
        self.data[name] = boxcox1p(self.data[name], lmbda)
def applyBoxCox(features):
    """Box-Cox transform the skewed numeric columns of ``features`` in place.

    A column is numeric when its dtype is one of the int/float dtypes listed
    below, and skewed when its skew exceeds 0.5.

    Returns ``(features, bc_alphas)`` where ``bc_alphas`` maps each transformed
    column to the lambda that was used for it.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerics = [name for name in features.columns
                if features[name].dtype in numeric_dtypes]

    skewness = features[numerics].apply(skew).sort_values(ascending=False)

    bc_alphas = {}
    for name in skewness[skewness > 0.5].index:
        bc_alphas[name] = boxcox_normmax(features[name] + 1)
        features[name] = boxcox1p(features[name], bc_alphas[name])
    return features, bc_alphas
def skewed_features(self, threshold=0.75, fix=False, return_series=False):
    """
    Measure the skewness of the numerical features and optionally fix it.

    Features are obtained with ``self.select('numerical')``. When ``fix`` is
    True, every feature whose skew exceeds ``threshold`` is replaced in
    ``self.features`` by its ``boxcox1p`` transform.

    :return: A pandas Series with the features and their skewness (sorted
             descending, measured before any fix) when ``return_series`` is
             True; otherwise None
    """
    numerical = self.select('numerical')
    feature_skew = numerical.apply(skew).sort_values(ascending=False)

    if fix is True:
        for name in feature_skew.index[feature_skew > threshold]:
            column = numerical[name]
            self.features[name] = boxcox1p(column, boxcox_normmax(column + 1))

    if return_series is True:
        return feature_skew
def fix_skewness(df):
    """Fix skewness in dataframe

    Every non-object column whose absolute skewness is above 0.5 is replaced,
    in place, by its ``boxcox1p`` transform.

    Args:
        df (pandas.DataFrame): The dataframe (input dataset)

    Returns:
        pandas.DataFrame: ``df`` itself, with the skewed columns transformed
    """
    # Skewness of all numerical (non-object) features, largest first
    num_feat = df.dtypes[df.dtypes != "object"].index
    skewed_num_feat = df[num_feat].apply(lambda x: skew(x)).sort_values(
        ascending=False)
    # high skewed if absolute skewness above 0.5
    high_skew = skewed_num_feat[abs(skewed_num_feat) > 0.5].index

    # Use the Box-Cox transformation to fix skewness
    for feat in high_skew:
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))
    # The docstring always promised a return value; the original returned None.
    return df
def box_cox_transform(endog):
    """Box-Cox transform every column of ``endog``.

    A column whose minimum is <= 0 is first shifted by ``-min * 1.01 + 1`` so
    that all of its values become positive; other columns get a shift of 0.

    Returns ``(endog_boxcox, lambdas, shifts)``: the transformed frame (same
    index as ``endog``) and two dicts keyed by column name holding the fitted
    lambda and the applied shift.
    """
    logging.info(
        'BoxCox transform for everything, return lambdas and shifts for restoring original data'
    )
    endog_boxcox = pd.DataFrame(index=endog.index)
    lambdas, shifts = {}, {}
    for column in endog.columns:
        lowest = endog[column].min()
        shift = -lowest * 1.01 + 1 if lowest <= 0.0 else 0
        shifted = endog[column] + shift

        lmbda = stats.boxcox_normmax(shifted)
        endog_boxcox[column] = stats.boxcox(shifted, lmbda)

        lambdas[column] = lmbda
        shifts[column] = shift
    return endog_boxcox, lambdas, shifts
def fixing_skewness(df):
    """
    Box-Cox transform the skewed numeric columns of a dataframe.

    Every non-object column whose absolute skewness exceeds 0.5 is replaced in
    place by its ``boxcox1p`` transform. The same dataframe is returned, as
    the original description promised (it used to return None).
    """
    ## Import necessary modules
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax, skew

    ## Getting all the data that are not of "object" type.
    numeric_feats = df.dtypes[df.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = df[numeric_feats].apply(lambda x: skew(x)).sort_values(
        ascending=False)
    high_skew = skewed_feats[abs(skewed_feats) > 0.5]

    for feat in high_skew.index:
        # boxcox1p transforms 1 + x, so lambda is fitted on the shifted column.
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))
    return df
def fix_skewness(df, skew_thresh = 0.5):
    """Box-Cox transform, in place, numeric columns whose skew exceeds ``skew_thresh``.

    Numeric means the column dtype is one of the int/float dtypes listed
    below. Returns the same ``df`` object.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Collect all the numerical features of the dataset
    numeric = [name for name in df.columns if df[name].dtype in numeric_dtypes]

    # Skew of every numeric feature, largest first
    features_skew = df[numeric].apply(skew).sort_values(ascending=False)

    # Features above the threshold get a Box-Cox transformation
    for name in features_skew.index[features_skew > skew_thresh]:
        column = df[name]
        df[name] = boxcox1p(column, boxcox_normmax(column + 1))
    return df
def BCLambda(data):
    """
    Gives optimal value of Box-Cox power transformation index lambda

    The absolute values of ``data`` are passed to ``boxcox_normmax`` with the
    bracket (-1.0, 2.0); a result outside [-1.0, 2.0] is replaced by the
    nearest bound.

    Parameters
    ----------
    data: 1D numpy array
        data in 1D numpy array

    Returns
    -------
    blambda: float
        Box-Cox power transformation index
    """
    magnitudes = abs(array(data, dtype=float))
    blambda = boxcox_normmax(magnitudes, brack=(-1.0, 2.0))
    if blambda > 2.0:
        return 2.0
    if blambda < -1.0:
        return -1.0
    return blambda
return df_all # Apply feature engineering to training and test data X_train = FE(X_tr) print("shape and columns of X_train after initial processing") print(X_train.shape) print(X_train.columns) X_test = FE(X_te) print("shape and columns of X_test after initial processing") print(X_test.shape) print(X_test.columns) # fix skew in some variables fix_var = ['LotFrontage', 'LotArea', 'BsmtUnfSF', '1stFlrSF', 'GrLivArea','TotalSF'] for var in fix_var: lam = boxcox_normmax(X_train[var] + 1) X_train[var] = boxcox1p(X_train[var], lam ) X_test[var] = boxcox1p(X_test[var],lam ) # Dummify X_train & X_test Xd_train = create_Xdummy(X_train) Xd_test = create_Xdummy(X_test) # get rid of columns that only appear in 1 dataset but not the other col1 = set(Xd_train.columns) - set(Xd_test.columns) col2 = set(Xd_test.columns) - set(Xd_train.columns) Xd_train.drop( col1, axis=1, inplace = True ) Xd_test.drop( col2, axis=1, inplace = True) # remove dummy columns that have very very low number of 1s col2del = []
def fit(self,X):
    """Estimate ``self.lmbda`` from ``X`` by maximum likelihood unless it was given.

    When ``self.lmbdaGiven`` is true nothing is changed. Returns None.
    """
    if self.lmbdaGiven:
        return
    fitted = boxcox_normmax(X, method='mle')
    self.lmbda = fitted
features.update(features[numerics].fillna(0)) numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] numerics2 = [] for i in features.columns: if features[i].dtype in numeric_dtypes: numerics2.append(i) skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values( ascending=False) high_skew = skew_features[skew_features > 0.5] skew_index = high_skew.index for i in skew_index: features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1)) features = features.drop([ 'Utilities', 'Street', 'PoolQC', ], axis=1) features['YrBltAndRemod'] = features['YearBuilt'] + features['YearRemodAdd'] features['TotalSF'] = features['TotalBsmtSF'] + features[ '1stFlrSF'] + features['2ndFlrSF'] features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] + features['1stFlrSF'] + features['2ndFlrSF'])
def boxcoxArrayArgs(X):
    """Return a list with one ``boxcox_normmax`` lambda per column of the 2-D array ``X``."""
    column_count = X.shape[1]
    lambdas = []
    for idx in range(column_count):
        lambdas.append(boxcox_normmax(X[:, idx]))
    return lambdas
def _explore_linear_data(data, boxplot_title, boxplot_path,
                         pairplot_title, pairplot_path):
    """Show and save the box plot and pair plot of ``data``; print its correlation matrix."""
    data.boxplot()
    # Titles are set before savefig so that they also appear in the saved file.
    plt.title(boxplot_title)
    plt.savefig(boxplot_path)
    plt.show()

    # Correlation matrix: r = cov(x, y) / (std(x) * std(y)).
    # Rule of thumb: 0-0.3 weak, 0.3-0.6 moderate, 0.6-1 strong correlation.
    print('相关系数: ', data.corr())

    # kind='reg' makes seaborn add a best-fit line with a 95% confidence band.
    sns.pairplot(data, x_vars=['x1', 'x2'], y_vars='y', height=7, aspect=0.8,
                 kind='reg')
    plt.title(pairplot_title)
    plt.savefig(pairplot_path)
    plt.show()


def _fit_and_plot_linear(data, fit_label, score_label, compare_title,
                         compare_path, resid_title, resid_path,
                         report_shapes=False):
    """Fit a linear regression on an 80/20 split of ``data`` and plot the results."""
    # The first two columns are the predictors, the third one the response.
    features = data.iloc[:, :2]
    target = data.iloc[:, 2:3]
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, train_size=.80)
    if report_shapes:
        print("原始数据特征:", features.shape,
              ",训练数据特征:", X_train.shape,
              ",测试数据特征:", X_test.shape)
        print("原始数据标签:", target.shape,
              ",训练数据标签:", Y_train.shape,
              ",测试数据标签:", Y_test.shape)

    model = LinearRegression()
    model.fit(X_train, Y_train)
    a = model.intercept_  # intercept
    b = model.coef_  # regression coefficients
    print(fit_label, a, ",回归系数:", b)

    # R^2 (coefficient of determination) on the held-out part:
    # 1 - SSE / total variation; 1 means a perfect fit, 0 no explanatory power.
    score = model.score(X_test, Y_test)
    print(score_label, score)

    # Predictions for the held-out part
    Y_pred = model.predict(X_test)
    print(Y_pred)

    # Predicted versus actual values; x is the sample number, y the value of Y.
    plt.figure()
    plt.plot(range(len(Y_pred)), Y_pred, 'b', label="predict")
    plt.plot(range(len(Y_pred)), Y_test, 'r', label="test")
    plt.legend(loc="upper right")  # show the line labels
    plt.xlabel('the number of Y')
    plt.ylabel("Y")
    plt.title(compare_title)
    plt.savefig(compare_path)
    plt.show()

    # Residual plot (predicted minus actual)
    y_dif = Y_pred[:, 0] - Y_test['y'].values
    df = pd.DataFrame({'x': range(len(y_dif)), 'y': y_dif})
    sns.residplot(x="x", y="y", data=df)
    plt.title(resid_title)
    plt.savefig(resid_path)
    plt.show()


def Q_6_7():
    """Regress y on x1**2 and x2, before and after a Box-Cox transform of y.

    Reads 'Data/LinearStatistic.csv', prints descriptive statistics, fits a
    linear model on a random 80/20 split and shows/saves diagnostic figures
    under 'result/线性统计/6.7/'. The analysis is then repeated with ``y``
    replaced by its Box-Cox transform. Nothing is returned.

    NOTE(review): the positional slicing assumes the CSV columns are ordered
    x1, x2, y - confirm against the data file.
    """
    data = pd.read_csv('Data/LinearStatistic.csv')
    print('原始数据:\n', data)

    # Preprocessing: the model uses the square of x1.
    data['x1'] = data['x1'] * data['x1']

    # Descriptive statistics
    print('数据描述:\n', data.describe())
    # Missing values per column. The original expression
    # data[data.isnull() == True].count() always printed zeros.
    print('缺失值检验:\n', data.isnull().sum())

    _explore_linear_data(
        data,
        '数据特征分析', "result/线性统计/6.7/boxplot_linear.jpg",
        '不同因素影响图', "result/线性统计/6.7/pairplot_linear.jpg")
    _fit_and_plot_linear(
        data,
        fit_label="最佳拟合线:截距",
        score_label='相关系数',
        compare_title='预测与源数据对比图',
        compare_path="result/线性统计/6.7/compare_linear.jpg",
        resid_title='残差图1',
        resid_path="result/线性统计/6.7/残差图1.jpg",
        report_shapes=True)

    # Box-Cox transform of the response. Lambda is fitted on Y itself, so the
    # matching transform is boxcox(Y, lambda); the original applied boxcox1p,
    # i.e. it transformed 1 + Y with a lambda estimated for Y.
    Y = data['y'].tolist()
    lam_best = boxcox_normmax(Y)
    print('lambda: ', lam_best)
    data['y'] = special.boxcox(Y, lam_best)

    # Same analysis on the transformed response
    _explore_linear_data(
        data,
        'bxo-cox后的数据特征分析',
        "result/线性统计/6.7/boxplot_linear_bxo_cox.jpg",
        'bxo-cox后的不同因素影响图',
        "result/线性统计/6.7/pairplot_linear_bxo_cox.jpg")
    _fit_and_plot_linear(
        data,
        fit_label="bxo-cox后的最佳拟合线:截距",
        score_label='R方检测 ',
        compare_title='bxo-cox后的预测与源数据对比图',
        compare_path="result/线性统计/6.7/compare_linear_box_cox.jpg",
        resid_title='bxo-cox后的残差图',
        resid_path="result/线性统计/6.7/残差图_bxo_cox.jpg")
def feature_selection(train=pd.DataFrame(), test=pd.DataFrame()):
    """Clean, impute, de-skew and one-hot encode the house-price train/test frames.

    Steps: run ``explore_data_analysis(train)``; drop training rows with
    ``GrLivArea >= 4500``; replace ``SalePrice`` by ``log(1 + SalePrice)``;
    concatenate train and test features; impute missing values; Box-Cox
    numeric columns whose skew exceeds 0.5; add aggregate and indicator
    features; one-hot encode; finally drop columns in which one value covers
    more than 99.94% of the training rows.

    :param train: training frame containing ``SalePrice``
    :param test: test frame with the same feature columns
    :return: ``(X, y, test)`` - encoded training features, log-transformed
        target and encoded test features
    """
    explore_data_analysis(train)

    # Copy the filtered rows: assigning into a slice of the caller's frame
    # triggers SettingWithCopy problems and may leak into the input.
    train = train[train['GrLivArea'] < 4500].copy()
    train['SalePrice'] = train['SalePrice'].map(lambda x: math.log(1 + x))
    y = train['SalePrice']

    train_features = train.drop("SalePrice", axis=1)
    test_features = test
    features = pd.concat([train_features, test_features], sort=False)

    # Categorical codes stored as numbers are turned into strings.
    features['MSSubClass'] = features['MSSubClass'].apply(str)
    features['YrSold'] = features['YrSold'].astype(str)
    features['MoSold'] = features['MoSold'].astype(str)

    features['Functional'] = features['Functional'].fillna('Typ')
    features['Electrical'] = features['Electrical'].fillna("SBrkr")
    features['KitchenQual'] = features['KitchenQual'].fillna("TA")
    features["PoolQC"] = features["PoolQC"].fillna("None")

    for col in ('Exterior1st', 'Exterior2nd', 'SaleType'):
        features[col] = features[col].fillna(features[col].mode()[0])

    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        features[col] = features[col].fillna(0)
    for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                'BsmtFinType2'):
        features[col] = features[col].fillna('None')

    # Fill MSZoning with the most frequent value (mode) of its MSSubClass group
    features['MSZoning'] = features.groupby(
        'MSSubClass')['MSZoning'].transform(
            lambda x: x.fillna(x.mode()[0]))

    # Remaining object columns: missing -> 'None'. Direct assignment replaces
    # the original features.update(...) round trip, which cannot handle
    # duplicate index labels after the concat above.
    object_cols = list(features.select_dtypes(include='object').columns)
    features[object_cols] = features[object_cols].fillna('None')

    # Fill LotFrontage with the median of its Neighborhood group
    features['LotFrontage'] = features.groupby(
        'Neighborhood')['LotFrontage'].transform(
            lambda x: x.fillna(x.median()))

    numeric_cols = list(
        features.select_dtypes(include=[
            'int16', 'int32', 'int64', 'float16', 'float32', 'float64'
        ]).columns)
    features[numeric_cols] = features[numeric_cols].fillna(0)

    # Skew of every numeric feature, sorted descending
    skew_features = features[numeric_cols].apply(
        lambda x: skew(x)).sort_values(ascending=False)
    high_skew = skew_features[skew_features > 0.5]
    for i in high_skew.index:
        # Box-Cox correction towards normality for columns with skew > 0.5
        features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))

    features = features.drop(['Utilities', 'Street', 'PoolQC'], axis=1)

    # Aggregate features
    features['YrBltAndRemod'] = (features['YearBuilt']
                                 + features['YearRemodAdd'])
    features['TotalSF'] = (features['TotalBsmtSF'] + features['1stFlrSF']
                           + features['2ndFlrSF'])
    features['Total_sqr_footage'] = (features['BsmtFinSF1']
                                     + features['BsmtFinSF2']
                                     + features['1stFlrSF']
                                     + features['2ndFlrSF'])
    features['Total_Bathrooms'] = (features['FullBath']
                                   + (0.5 * features['HalfBath'])
                                   + features['BsmtFullBath']
                                   + (0.5 * features['BsmtHalfBath']))
    features['Total_porch_sf'] = (features['OpenPorchSF']
                                  + features['3SsnPorch']
                                  + features['EnclosedPorch']
                                  + features['ScreenPorch']
                                  + features['WoodDeckSF'])

    # 0/1 indicator features (insertion order fixes the final column order)
    indicators = (('haspool', 'PoolArea'),
                  ('has2ndfloor', '2ndFlrSF'),
                  ('hasgarage', 'GarageArea'),
                  ('hasbsmt', 'TotalBsmtSF'),
                  ('hasfireplace', 'Fireplaces'))
    for flag, source in indicators:
        features[flag] = features[source].apply(lambda x: 1 if x > 0 else 0)

    # One-hot encode all categorical variables
    final_features = pd.get_dummies(features)

    # Split back into X_train and X_test by position
    X = final_features.iloc[:len(y), :]
    test = final_features.iloc[len(y):, :]

    # Columns where the most frequent value covers more than 99.94% of the
    # training rows are dropped as prone to overfitting.
    overfit = [
        col for col in X.columns
        if X[col].value_counts().iloc[0] / len(X[col]) * 100 > 99.94
    ]
    X = X.drop(overfit, axis=1)
    test = test.drop(overfit, axis=1)

    print("X shape:{}\t y shape:{}\t test shape:{}".format(
        X.shape, y.shape, test.shape))
    return X, y, test
def ets(y, horizon, quantile=0.95, opt_crit='lik', obj='rmse', boxcox=True):
    """
    Calls R for each ETS model and returns the forecast, model and metrics of the best one.

    Every combination of error, trend, season and damping is fitted by hand
    rather than with model = 'ZZZ', because some models are ignored sometimes
    with 'ZZZ'. Each combination is fitted once per candidate Box-Cox lambda.
    The model parameters are obtained by optimizing ``opt_crit``; the winner
    is the fit with the smallest ``obj`` whose forecast passes
    ``fit_nan_check``.
    Assumes all the data is > 0.

    :param y: original time series
    :param horizon: how many points to predict
    :param quantile: CI for forecast error
    :param opt_crit: what to optimize to get the model parameters (lik, mse, sigma, ...). See R
    :param obj: metric used to select the best model: 'aic' selects by AIC,
        any other value selects by rmse. Smaller is better.
    :param boxcox: if True, candidate Box-Cox lambdas are tried in addition to
        no transformation
    :return: a dict with keys: df_out, model (params), rmse, states (the HW
        recursion values) and aic. When no model could be fitted the model,
        rmse, states and aic entries are None.
    """
    h_r = robjects.IntVector([horizon])
    lvl_r = robjects.FloatVector([quantile])
    opt_r = robjects.StrVector([opt_crit])
    false_r = robjects.BoolVector([False])
    true_r = robjects.BoolVector([True])
    null_r = robjects.NULL
    env = robjects.r.globalenv()

    # get good BoxCox values for lambda
    y_r = robjects.FloatVector(y)
    if boxcox is True:
        lambdas = list(sps.boxcox_normmax(y, brack=(-5, 0), method='all')) \
            + list(sps.boxcox_normmax(y, brack=(0, 5), method='all'))
        lambdas.append(
            fcast.BoxCox_lambda(y_r, method='guerrero', lower=0, upper=5)[0])
        lambdas.append(
            fcast.BoxCox_lambda(y_r, method='guerrero', lower=-5, upper=0)[0])
        lambdas.append(None)  # None means: no Box-Cox transformation
        lambdas = list(set(lambdas))
    else:
        lambdas = [None]
    data_in = [(y_r, l) for l in lambdas]

    lbda_opt = rmse_opt = aic_opt = obj_opt = None
    yhat = yupr = ylwr = params = states = None
    for y_r, l_val in data_in:
        season = int(r_frequency(y_r, env))
        # new y_r with tsp attributes
        y_r = set_tsp(y_r, season, env)
        season_mdls = ['M', 'A', 'N'] if (season > 1 and len(y) >= 4 * season) else ['N']
        # iterate over all models
        for m in itertools.product(['M', 'A'], ['N', 'M', 'A'], season_mdls):
            model = ''.join(m)
            # damping is only tried when the model has a trend component
            damp_list = [False, True] if m[1] != 'N' else [False]
            for dp in damp_list:
                d_r = robjects.BoolVector([dp])
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")
                    try:
                        lbda_r = null_r if l_val is None else robjects.FloatVector([l_val])
                        etsObj_r = fcast.ets(
                            y_r, damped=d_r, model=model, restrict=false_r,
                            **{'opt.crit': opt_r,
                               'allow.multiplicative.trend': true_r,
                               'lambda': lbda_r})
                        fcast_obj_r = fcast.forecast(
                            etsObj_r, h=h_r, level=lvl_r,
                            **{'find.frequency': true_r, 'lambda': lbda_r})
                        mdl_rmse = np.sqrt(etsObj_r.rx2('mse')[0])
                        mdl_aic = etsObj_r.rx2('aic')[0]
                        # model selection: take the smallest value
                        mdl_obj = mdl_aic if obj == 'aic' else mdl_rmse
                        mean_fc = list(fcast_obj_r.rx2('mean'))
                        no_nans = fit_nan_check(np.array(mean_fc))
                        if (obj_opt is None or mdl_obj < obj_opt) and no_nans:
                            # Remember the best objective seen so far. The
                            # original never updated obj_opt, so every valid
                            # model replaced the previous one and the last
                            # fit won instead of the best.
                            obj_opt = mdl_obj
                            rmse_opt = mdl_rmse
                            aic_opt = mdl_aic
                            lbda_opt = l_val
                            states = etsObj_r.rx2('states')
                            params = {k: v for k, v in etsObj_r.rx2('par').items()}
                            params['model'] = etsObj_r.rx2('method')[0]  # opt_mdl
                            params['lambda'] = l_val
                            params['season'] = season
                            params['type'] = 'ets'
                            # append to one-step fcast the out-of-sample fcast
                            fitted = list(fcast_obj_r.rx2('fitted'))
                            yhat = fitted + mean_fc
                            yupr = fitted + list(fcast_obj_r.rx2('upper'))
                            ylwr = fitted + list(fcast_obj_r.rx2('lower'))
                            params['no_nans'] = no_nans
                        my_log.debug('\tr_ets processed model::rmse: ' + str(mdl_rmse)
                                     + ' aic: ' + str(mdl_aic)
                                     + ' params: ' + str(params))
                    except rinterface.RRuntimeError:
                        my_log.debug('\tr_ets failed model: ' + str(model)
                                     + ' damped: ' + str(dp)
                                     + ' lbda: ' + str(l_val))
                        continue

    # params stays None when every fit failed; do not crash on the log line.
    opt_model = params['model'] if params is not None else None
    my_log.debug('\tr_ets opt model: ' + str(opt_model) + ' rmse: ' + str(rmse_opt)
                 + ' lambda: ' + str(lbda_opt))
    # pad the observed series with NaN over the forecast horizon
    y = list(y) + [np.nan] * horizon
    df = pd.DataFrame({'y': y, 'yhat': yhat, 'yupr': yupr, 'ylwr': ylwr})
    return {'df_out': df, 'model': params, 'rmse': rmse_opt, 'states': states, 'aic': aic_opt}
def test_all(self):
    """method='all' must return both expected lambdas (relative tolerance 1e-6)."""
    expected = [1.804465, 1.758101]
    lambdas = stats.boxcox_normmax(self.x, method='all')
    assert_allclose(lambdas, expected, rtol=1e-6)
def test_all(self):
    """method='all' must return both expected lambdas at assert_allclose's default tolerance."""
    expected = [1.804465325046, 1.758101454114]
    lambdas = stats.boxcox_normmax(self.x, method='all')
    assert_allclose(lambdas, expected)
def test_pearsonr(self):
    """boxcox_normmax with its default method must return the expected lambda."""
    lmbda_default = stats.boxcox_normmax(self.x)
    assert_allclose(lmbda_default, 1.804465, rtol=1e-6)
def main():
    """Run one ``scipy.stats`` routine on every row of a tab-separated file.

    Options come from the command line (see ``_build_parser``).  Each input
    line is split on tabs.  The cells named by ``--sample_one_cols`` and
    ``--sample_two_cols`` (comma-separated, 1-based column numbers) form the
    samples handed to the routine selected with ``--test_id``;
    ``--sample_cols`` supplies several ';'-separated column groups for the
    multi-sample tests.  The routine's results are appended to the row's
    cells and the row is written, tab-separated, to ``--outfile``.  A
    ``--test_id`` that matches no branch copies the row unchanged.

    NOTE(review): several routines called below (for example ``nanmean``,
    ``nanstd``, ``itemfreq``, ``signaltonoise``, ``threshold``, ``histogram``,
    ``histogram2``, ``binom_test``) appear to be missing from recent SciPy
    releases -- confirm against the SciPy version this tool is pinned to.
    """
    args = _build_parser().parse_args()
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2

    # Compare by value: the limits are floats, so an identity test against
    # the int literal 0 could never succeed and the "no limits" branches
    # were unreachable.
    no_limits = nf == 0 and mf == 0

    has_groups = args.sample_cols is not None
    has_one = args.sample_one_cols is not None
    has_two = args.sample_two_cols is not None
    if has_groups:
        # Concrete lists, not lazy ``map`` objects: the groups are reused for
        # every input row and an iterator would be exhausted after the first.
        barlett_samples = [
            [int(column) for column in group.split(",")]
            for group in args.sample_cols.split(";")
        ]
    if has_one:
        sample_one_cols = args.sample_one_cols.split(",")
    if has_two:
        sample_two_cols = args.sample_two_cols.split(",")

    with open(args.outfile, "w+") as outfile, open(args.infile) as infile:
        for line in infile:
            cols = line.strip().split("\t")
            if has_groups:
                b_samples = columns_to_values(barlett_samples, line)
            sample_one = []
            sample_two = []
            if has_one:
                sample_one = [cols[int(index) - 1] for index in sample_one_cols]
            if has_two:
                sample_two = [cols[int(index) - 1] for index in sample_two_cols]

            # Cells are converted to floats only inside the branch that uses
            # them, so columns a given test ignores never have to be numeric.
            test = test_id.strip()
            if test == "describe":
                size, min_max, mean, uv, bs, bk = stats.describe(_as_floats(sample_one))
                cols.extend([size, min_max, mean, uv, bs, bk])
            elif test == "mode":
                vals, counts = stats.mode(_as_floats(sample_one))
                cols.extend([vals, counts])
            elif test == "nanmean":
                cols.append(stats.nanmean(_as_floats(sample_one)))
            elif test == "nanmedian":
                cols.append(stats.nanmedian(_as_floats(sample_one)))
            elif test == "kurtosistest":
                z_value, p_value = stats.kurtosistest(_as_floats(sample_one))
                cols.extend([z_value, p_value])
            elif test == "variation":
                cols.append(stats.variation(_as_floats(sample_one)))
            elif test == "itemfreq":
                freq = stats.itemfreq(_as_floats(sample_one))
                cols.extend(",".join(map(str, row)) for row in freq)
            elif test == "boxcox_llf":
                cols.append(stats.boxcox_llf(imbda, _as_floats(sample_one)))
            elif test == "tiecorrect":
                cols.append(stats.tiecorrect(_as_floats(sample_one)))
            elif test == "rankdata":
                cols.append(stats.rankdata(_as_floats(sample_one), method=args.md))
            elif test == "nanstd":
                cols.append(stats.nanstd(_as_floats(sample_one), bias=args.bias))
            elif test == "anderson":
                A2, critical, sig = stats.anderson(_as_floats(sample_one), dist=args.dist)
                cols.append(A2)
                cols.extend(critical)
                cols.append(",")
                cols.extend(sig)
            elif test == "binom_test":
                cols.append(stats.binom_test(_as_floats(sample_one), n=args.n, p=args.p))
            elif test == "gmean":
                cols.append(stats.gmean(_as_floats(sample_one), dtype=args.dtype))
            elif test == "hmean":
                cols.append(stats.hmean(_as_floats(sample_one), dtype=args.dtype))
            elif test == "kurtosis":
                k = stats.kurtosis(
                    _as_floats(sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias
                )
                cols.append(k)
            elif test == "moment":
                cols.append(stats.moment(_as_floats(sample_one), n=args.n))
            elif test == "normaltest":
                k2, p_value = stats.normaltest(_as_floats(sample_one))
                cols.extend([k2, p_value])
            elif test == "skew":
                cols.append(stats.skew(_as_floats(sample_one), bias=args.bias))
            elif test == "skewtest":
                z_value, p_value = stats.skewtest(_as_floats(sample_one))
                cols.extend([z_value, p_value])
            elif test == "sem":
                cols.append(stats.sem(_as_floats(sample_one), ddof=args.ddof))
            elif test == "zscore":
                cols.extend(stats.zscore(_as_floats(sample_one), ddof=args.ddof))
            elif test == "signaltonoise":
                cols.append(stats.signaltonoise(_as_floats(sample_one), ddof=args.ddof))
            elif test == "percentileofscore":
                p = stats.percentileofscore(
                    _as_floats(sample_one), score=args.score, kind=args.kind
                )
                cols.append(p)
            elif test == "bayes_mvs":
                c_mean, c_var, c_std = stats.bayes_mvs(_as_floats(sample_one), alpha=args.alpha)
                cols.extend([c_mean, c_var, c_std])
            elif test == "sigmaclip":
                c, c_low, c_up = stats.sigmaclip(
                    _as_floats(sample_one), low=args.m, high=args.n
                )
                cols.extend([c, c_low, c_up])
            elif test == "kstest":
                d, p_value = stats.kstest(
                    _as_floats(sample_one),
                    cdf=args.cdf,
                    N=args.N,
                    alternative=args.alternative,
                    mode=args.mode,
                )
                cols.extend([d, p_value])
            elif test == "chi2_contingency":
                chi2, p, dof, ex = stats.chi2_contingency(
                    _as_floats(sample_one), correction=args.correction, lambda_=args.lambda_
                )
                cols.extend([chi2, p, dof, ex])
            elif test == "tmean":
                if no_limits:
                    mean = stats.tmean(_as_floats(sample_one))
                else:
                    mean = stats.tmean(
                        _as_floats(sample_one), (mf, nf), (inclusive1, inclusive2)
                    )
                cols.append(mean)
            elif test == "tmin":
                if mf == 0:
                    trimmed_min = stats.tmin(_as_floats(sample_one))
                else:
                    trimmed_min = stats.tmin(
                        _as_floats(sample_one), lowerlimit=mf, inclusive=args.inclusive
                    )
                cols.append(trimmed_min)
            elif test == "tmax":
                if nf == 0:
                    trimmed_max = stats.tmax(_as_floats(sample_one))
                else:
                    trimmed_max = stats.tmax(
                        _as_floats(sample_one), upperlimit=nf, inclusive=args.inclusive
                    )
                cols.append(trimmed_max)
            elif test == "tvar":
                if no_limits:
                    var = stats.tvar(_as_floats(sample_one))
                else:
                    var = stats.tvar(
                        _as_floats(sample_one), (mf, nf), (inclusive1, inclusive2)
                    )
                cols.append(var)
            elif test == "tstd":
                if no_limits:
                    std = stats.tstd(_as_floats(sample_one))
                else:
                    std = stats.tstd(
                        _as_floats(sample_one), (mf, nf), (inclusive1, inclusive2)
                    )
                cols.append(std)
            elif test == "tsem":
                if no_limits:
                    s = stats.tsem(_as_floats(sample_one))
                else:
                    s = stats.tsem(
                        _as_floats(sample_one), (mf, nf), (inclusive1, inclusive2)
                    )
                cols.append(s)
            elif test == "scoreatpercentile":
                if no_limits:
                    s = stats.scoreatpercentile(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        interpolation_method=args.interpolation,
                    )
                else:
                    s = stats.scoreatpercentile(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        (mf, nf),
                        interpolation_method=args.interpolation,
                    )
                cols.extend(s)
            elif test == "relfreq":
                if no_limits:
                    rel, low_range, binsize, ex = stats.relfreq(_as_floats(sample_one), args.b)
                else:
                    rel, low_range, binsize, ex = stats.relfreq(
                        _as_floats(sample_one), args.b, (mf, nf)
                    )
                cols.extend(rel)
                cols.extend([low_range, binsize, ex])
            elif test == "binned_statistic":
                if no_limits:
                    st, b_edge, b_n = stats.binned_statistic(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                    )
                else:
                    st, b_edge, b_n = stats.binned_statistic(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                        range=(mf, nf),
                    )
                cols.extend([st, b_edge, b_n])
            elif test == "threshold":
                if no_limits:
                    o = stats.threshold(_as_floats(sample_one), newval=args.new)
                else:
                    o = stats.threshold(_as_floats(sample_one), mf, nf, newval=args.new)
                cols.extend(o)
            elif test == "trimboth":
                o = stats.trimboth(
                    _as_floats(sample_one), proportiontocut=args.proportiontocut
                )
                cols.extend(o)
            elif test == "trim1":
                t1 = stats.trim1(
                    _as_floats(sample_one),
                    proportiontocut=args.proportiontocut,
                    tail=args.tail,
                )
                cols.extend(t1)
            elif test == "histogram":
                if no_limits:
                    hi, low_range, binsize, ex = stats.histogram(_as_floats(sample_one), args.b)
                else:
                    hi, low_range, binsize, ex = stats.histogram(
                        _as_floats(sample_one), args.b, (mf, nf)
                    )
                cols.extend([hi, low_range, binsize, ex])
            elif test == "cumfreq":
                if no_limits:
                    cum, low_range, binsize, ex = stats.cumfreq(_as_floats(sample_one), args.b)
                else:
                    cum, low_range, binsize, ex = stats.cumfreq(
                        _as_floats(sample_one), args.b, (mf, nf)
                    )
                cols.extend([cum, low_range, binsize, ex])
            elif test == "boxcox_normmax":
                if no_limits:
                    ma = stats.boxcox_normmax(_as_floats(sample_one))
                else:
                    ma = stats.boxcox_normmax(
                        _as_floats(sample_one), (mf, nf), method=args.method
                    )
                cols.append(ma)
            elif test == "boxcox":
                # Value comparison so an explicit ``--imbda 0`` (parsed as the
                # float 0.0) behaves exactly like the integer default 0.
                if imbda == 0:
                    box, ma, ci = stats.boxcox(_as_floats(sample_one), alpha=args.alpha)
                    cols.extend([box, ma, ci])
                else:
                    box = stats.boxcox(_as_floats(sample_one), imbda, alpha=args.alpha)
                    cols.append(box)
            elif test == "histogram2":
                h2 = stats.histogram2(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend(h2)
            elif test == "ranksums":
                z_statistic, p_value = stats.ranksums(
                    _as_floats(sample_one), _as_floats(sample_two)
                )
                cols.extend([z_statistic, p_value])
            elif test == "ttest_1samp":
                t, prob = stats.ttest_1samp(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend(t)
                cols.extend(prob)
            elif test == "ansari":
                AB, p_value = stats.ansari(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([AB, p_value])
            elif test == "linregress":
                slope, intercept, r_value, p_value, stderr = stats.linregress(
                    _as_floats(sample_one), _as_floats(sample_two)
                )
                cols.extend([slope, intercept, r_value, p_value, stderr])
            elif test == "pearsonr":
                cor, p_value = stats.pearsonr(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([cor, p_value])
            elif test == "pointbiserialr":
                r, p_value = stats.pointbiserialr(
                    _as_floats(sample_one), _as_floats(sample_two)
                )
                cols.extend([r, p_value])
            elif test == "ks_2samp":
                d, p_value = stats.ks_2samp(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend([d, p_value])
            elif test == "mannwhitneyu":
                mw_stats_u, p_value = stats.mannwhitneyu(
                    _as_floats(sample_one),
                    _as_floats(sample_two),
                    use_continuity=args.mwu_use_continuity,
                )
                cols.extend([mw_stats_u, p_value])
            elif test == "zmap":
                z = stats.zmap(
                    _as_floats(sample_one), _as_floats(sample_two), ddof=args.ddof
                )
                cols.extend(z)
            elif test == "ttest_ind":
                mw_stats_u, p_value = stats.ttest_ind(
                    _as_floats(sample_one), _as_floats(sample_two), equal_var=args.equal_var
                )
                cols.extend([mw_stats_u, p_value])
            elif test == "ttest_rel":
                t, prob = stats.ttest_rel(
                    _as_floats(sample_one), _as_floats(sample_two), axis=args.axis
                )
                cols.extend([t, prob])
            elif test == "mood":
                z, p_value = stats.mood(
                    _as_floats(sample_one), _as_floats(sample_two), axis=args.axis
                )
                cols.extend([z, p_value])
            elif test == "shapiro":
                W, p_value, a = stats.shapiro(
                    _as_floats(sample_one), _as_floats(sample_two), args.reta
                )
                cols.extend([W, p_value])
                cols.extend(a)
            elif test == "kendalltau":
                k, p_value = stats.kendalltau(
                    _as_floats(sample_one),
                    _as_floats(sample_two),
                    initial_lexsort=args.initial_lexsort,
                )
                cols.extend([k, p_value])
            elif test == "entropy":
                s = stats.entropy(
                    _as_floats(sample_one), _as_floats(sample_two), base=args.base
                )
                cols.append(s)
            elif test == "spearmanr":
                if has_two:
                    rho, p_value = stats.spearmanr(
                        _as_floats(sample_one), _as_floats(sample_two)
                    )
                else:
                    rho, p_value = stats.spearmanr(_as_floats(sample_one))
                cols.extend([rho, p_value])
            elif test == "wilcoxon":
                if has_two:
                    T, p_value = stats.wilcoxon(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                else:
                    T, p_value = stats.wilcoxon(
                        _as_floats(sample_one),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                cols.extend([T, p_value])
            elif test == "chisquare":
                if has_two:
                    rho, p_value = stats.chisquare(
                        _as_floats(sample_one), _as_floats(sample_two), ddof=args.ddof
                    )
                else:
                    rho, p_value = stats.chisquare(_as_floats(sample_one), ddof=args.ddof)
                cols.extend([rho, p_value])
            elif test == "power_divergence":
                if has_two:
                    stat, p_value = stats.power_divergence(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        ddof=args.ddof,
                        lambda_=args.lambda_,
                    )
                else:
                    stat, p_value = stats.power_divergence(
                        _as_floats(sample_one), ddof=args.ddof, lambda_=args.lambda_
                    )
                cols.extend([stat, p_value])
            elif test == "theilslopes":
                if has_two:
                    mpe, met, lo, up = stats.theilslopes(
                        _as_floats(sample_one), _as_floats(sample_two), alpha=args.alpha
                    )
                else:
                    mpe, met, lo, up = stats.theilslopes(
                        _as_floats(sample_one), alpha=args.alpha
                    )
                cols.extend([mpe, met, lo, up])
            elif test == "combine_pvalues":
                if has_two:
                    stat, p_value = stats.combine_pvalues(
                        _as_floats(sample_one),
                        method=args.med,
                        weights=_as_floats(sample_two),
                    )
                else:
                    stat, p_value = stats.combine_pvalues(
                        _as_floats(sample_one), method=args.med
                    )
                cols.extend([stat, p_value])
            elif test == "obrientransform":
                ob = stats.obrientransform(*b_samples)
                cols.extend(",".join(map(str, row)) for row in ob)
            elif test == "f_oneway":
                f_value, p_value = stats.f_oneway(*b_samples)
                cols.extend([f_value, p_value])
            elif test == "kruskal":
                h, p_value = stats.kruskal(*b_samples)
                cols.extend([h, p_value])
            elif test == "friedmanchisquare":
                fr, p_value = stats.friedmanchisquare(*b_samples)
                cols.extend([fr, p_value])
            elif test == "fligner":
                xsq, p_value = stats.fligner(
                    *b_samples, center=args.center, proportiontocut=args.proportiontocut
                )
                cols.extend([xsq, p_value])
            elif test == "bartlett":
                T, p_value = stats.bartlett(*b_samples)
                cols.extend([T, p_value])
            elif test == "levene":
                w, p_value = stats.levene(
                    *b_samples, center=args.center, proportiontocut=args.proportiontocut
                )
                cols.extend([w, p_value])
            elif test == "median_test":
                stat, p_value, m, table = stats.median_test(
                    *b_samples,
                    ties=args.ties,
                    correction=args.correction,
                    lambda_=args.lambda_,
                )
                cols.extend([stat, p_value, m, table])
                cols.extend(",".join(map(str, row)) for row in table)

            outfile.write("%s\n" % "\t".join(map(str, cols)))


def _as_floats(values):
    """Return *values* converted to a concrete list of floats.

    A list is built instead of a lazy ``map`` object so the statistics
    routines receive a real sequence.
    """
    return [float(value) for value in values]


def _build_parser():
    """Create the argument parser holding every option that ``main`` reads."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;"
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument(
        "--fisher", action="store_true", default=False, help="if true then Fisher definition is used"
    )
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument(
        "--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored"
    )
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument(
        "--inclusive", action="store_true", default=False, help="if false,limit will be ignored"
    )
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    # The default must be the boolean False: the former string "False" is
    # truthy, which switched the option on even when the flag was absent.
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument(
        "--correction", action="store_true", default=False, help="continuity correction "
    )
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds"
    )
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument(
        "--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e"
    )
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")
    return parser