def draw_combined_hist(df, countries, country_names, winsorize=False):
    """Draw overlaid per-country histograms and save the figure as a PNG.

    Parameters
    ----------
    df : DataFrame-like
        Indexed by the entries of ``countries``; each selected column is
        stripped of NaNs before plotting.
    countries : sequence
        Column keys to plot.
    country_names : sequence
        Legend labels, positionally matching ``countries``.
    winsorize : bool
        When truthy, the top 5% of every series is winsorized and the file is
        saved as ``winsorized_hist.png``; otherwise ``hist.png``.
    """
    plt.cla()
    plt.figure(1, figsize=(5, 3))
    trim = 0.05 if winsorize else 0.0

    # Winsorize every series exactly once and reuse the result for both the
    # global maximum and the plot itself.
    series = [mstats.winsorize(df[c].dropna().values, (0, trim)) for c in countries]

    # The largest value across all series fixes the shared bin range.
    m = 0
    for x in series:
        if x.max() > m:
            m = x.max()

    # np.linspace requires an integer sample count; m is usually a numpy
    # scalar (possibly a float), so convert explicitly.
    bins = np.linspace(0, m, int(m))

    for i, x in enumerate(series):
        plt.hist(x, bins, alpha=0.5, label=country_names[i])

    plt.legend(loc='upper right')
    plt.title(
        'Histogram for distribution of Starbucks stores\n across cities in a country'
    )
    name = 'winsorized_hist.png' if winsorize else 'hist.png'
    fname = os.path.join(glob.OUTPUT_DIR_NAME, glob.EDA_DIR, 'more', name)
    plt.savefig(fname)
def plots(df, transformation="boxcox", fig_size=(15, 8), whis=1.5, wins=(0, 0)):
    """Transform every strictly positive numeric column and plot it.

    Each qualifying column is winsorized with ``wins``, transformed in place
    (``df`` is modified), and shown as a box plot next to a distribution plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to transform; non-object columns whose values are all > 0 are used.
    transformation : str
        ``"boxcox"`` or ``"log"``. Any other value prints a hint and returns.
    fig_size : tuple
        Figure size passed to ``plt.figure``.
    whis : float
        Whisker length for the box plot (now honoured for both transformations).
    wins : tuple
        Lower/upper winsorization fractions.
    """
    if transformation not in ("boxcox", "log"):
        print("tranformation type should be one of these:\n*log\n*boxcox")
        return

    numeric_features = df.columns[df.dtypes != object]
    positive_features = [f for f in numeric_features if (df[f] > 0).all()]

    for feature in positive_features:
        clipped = winsorize(df[feature], wins)
        if transformation == "boxcox":
            # boxcox returns (transformed values, fitted lambda)
            df[feature], _ = boxcox(clipped)
        else:
            df[feature] = np.log(clipped)

        plt.figure(figsize=fig_size)
        plt.subplot(1, 2, 1)
        sns.boxplot(df[feature], whis=whis)
        plt.subplot(1, 2, 2)
        sns.distplot(df[feature])
        plt.show()
def get_index_list_pe_pb_date(code_list, date):
    '''Return PE/PB valuation metrics for each index on the given date.

    For every index code the constituents are looked up with
    ``get_idx_components`` and four variants of PE and PB are computed:

    * ``pe1``/``pb1`` - aggregate method, market-cap weighted.
    * ``pe2``/``pb2`` - equal weight; non-positive ratios contribute nothing
      to the denominator but are still counted in the numerator.
    * ``pe3``/``pb3`` - median, no preprocessing.
    * ``pe4``/``pb4`` - arithmetic mean after winsorizing 2.5% in each tail.

    Returns a dict ``{code: {'pe1': ..., 'pb1': ..., ...}}`` with values
    rounded to two decimals. Indexes without usable constituents are omitted.
    '''
    ret_dict = {}
    df_all = get_fundamentals(query(valuation), date)  # all stocks on that date
    for code in code_list:
        stocks = get_idx_components(code, date)
        df = df_all[df_all['code'].isin(stocks)]  # constituents of one index

        # Drop zero ratios before anything else: they cannot be inverted.
        df = df[df.pb_ratio != 0]
        df = df[df.pe_ratio != 0]

        # Check emptiness AFTER filtering, otherwise an index whose
        # constituents all have zero ratios would divide 0 by 0 below.
        if len(df) == 0:
            continue

        # Aggregate method, market-cap weighted
        pe1 = sum(df.market_cap) / sum(df.market_cap / df.pe_ratio)
        pb1 = sum(df.market_cap) / sum(df.market_cap / df.pb_ratio)
        # Equal weight, loss-making entries set to zero
        pe2 = len(df) / sum(1 / df.pe_ratio[df.pe_ratio > 0])
        pb2 = len(df) / sum(1 / df.pb_ratio[df.pb_ratio > 0])
        # Median, no preprocessing needed
        pe3 = df.pe_ratio.median()
        pb3 = df.pb_ratio.median()
        # Arithmetic mean over the central 95% (2.5% winsorized per tail)
        pe4 = mean(mstats.winsorize(df.pe_ratio, limits=0.025))
        pb4 = mean(mstats.winsorize(df.pb_ratio, limits=0.025))

        ret_dict[code] = {
            'pe1': round(pe1, 2),
            'pb1': round(pb1, 2),
            'pe2': round(pe2, 2),
            'pb2': round(pb2, 2),
            'pe3': round(pe3, 2),
            'pb3': round(pb3, 2),
            'pe4': round(pe4, 2),
            'pb4': round(pb4, 2),
        }
    return ret_dict
def winsorize_quantTrans(self, lower=0.10, upper=0.10, ignore_zero=True):
    '''Winsorize ``self.quantity`` and record the step.

    PARAMETERS
    lower: fraction of the lowest values to replace with the value at that
        lower cut-off
    upper: fraction of the highest values to replace with the value at that
        upper cut-off
    ignore_zero: when True only the non-zero entries are winsorized (and
        written back in place); otherwise the whole array is replaced

    RETURNS
    None; ``self.quantity`` is updated and a description is appended to
    ``self._quantity['Transformation']``
    '''
    bounds = [lower, upper]
    if not ignore_zero:
        self.quantity = np.array(mstats.winsorize(self.quantity, limits=bounds))
        note = 'Winsorized based on limits {}, {}'.format(lower, upper)
    else:
        # Positions of the non-zero entries; zeros are left untouched.
        positions = np.nonzero(self.quantity)[0]
        self.quantity[positions] = mstats.winsorize(self.quantity[positions],
                                                    limits=bounds)
        note = 'Winsorized nonzeros based on limits {}, {}'.format(lower, upper)
    self._quantity['Transformation'].append(note)
def test_winsorization(self):
    "Check the winsorized variance and that the mask survives winsorization."
    sample = ma.array([77, 87, 88, 114, 151, 210, 219, 246, 253, 262,
                       296, 299, 306, 376, 428, 515, 666, 1310, 2611])
    # 20% winsorization on both tails, compared to one decimal place.
    clipped = mstats.winsorize(sample, (0.2, 0.2))
    assert_almost_equal(clipped.var(ddof=1), 21551.4, 1)
    # Mask one observation; winsorizing without limits must keep the mask.
    sample[5] = masked
    result = mstats.winsorize(sample)
    assert_equal(result.mask, sample.mask)
def test_winsorization(self):
    """Winsorized variance matches the reference; masks are preserved."""
    observations = [77, 87, 88, 114, 151, 210, 219, 246, 253, 262,
                    296, 299, 306, 376, 428, 515, 666, 1310, 2611]
    data = ma.array(observations)
    variance = mstats.winsorize(data, (0.2, 0.2)).var(ddof=1)
    assert_almost_equal(variance, 21551.4, 1)
    # With an element masked and no limits given, the mask must be unchanged.
    data[5] = masked
    assert_equal(mstats.winsorize(data).mask, data.mask)
def standardize(df):
    '''
    Winsorize the whole frame, then convert each column to z-scores.

    :param df: (pandas dataframe) dataframe with columns: years, index: permnos, entries: metric_results
    :return: (pandas dataframe) winsorized dataframe of z_scores

    The input frame is left untouched; a winsorized copy is standardized.
    '''
    # Keep the middle 95% of data. Clip the rest. NaN's in data will change percentiles.
    # Use the RETURNED array: inplace=True cannot be relied on to write through
    # to a DataFrame, because the values may be copied on the way in.
    winsorized = winsorize(df.values, (0.025, 0.025))
    clipped = df.copy()
    clipped[:] = winsorized.data  # same shape as df, limits applied over all entries
    z_score = (clipped - clipped.mean()) / clipped.std()
    return z_score
def normalize_image_0_1_by_3channel(X, y):
    """Winsorize each of the 3 channels, then shift and scale the whole array.

    The top 10% of every channel (axis 4) is winsorized independently. The
    result is shifted by its global minimum and divided by its global
    standard deviation. ``X`` is not modified; ``y`` is passed through.
    """
    print('--- Normalizing by channel, expecting 3 channel input ---')
    assert X.shape[4] == 3
    clipped = np.copy(X)
    for channel in range(3):
        clipped[:, :, :, :, channel] = mstats.winsorize(X[:, :, :, :, channel], [0, 0.1])
    shifted = clipped - clipped.min()
    scaled = shifted / clipped.std()
    return scaled, y
def playerdistribution(player, quantile=0.75):
    """Simulate a player's total fantasy points and return a KDE plot.

    Rows of the module-level ``players21`` table whose ``'Player'`` equals
    ``player`` supply ``'GameFP/36'`` and ``'MIN'``. Their 5%-winsorized means
    and sample covariance parameterize a bivariate normal from which 1000
    draws are taken; the total is ``fp / 36 * minutes``.

    Returns ``(graph, quantile_value, covariance)`` where ``quantile_value``
    is the requested quantile of the simulated totals.
    """
    # Filter on the `player` argument (the original referenced an unrelated
    # name `name`, so the argument was ignored).
    jc = players21.loc[players21['Player'] == player][['GameFP/36', 'MIN']]
    fp = winsorize(jc['GameFP/36'], [0.05, 0.05]).mean()
    minutes = winsorize(jc['MIN'], [0.05, 0.05]).mean()
    # np.cov expects variables in rows, hence the transpose.
    covariance = np.cov(jc.T)
    distribution = pd.DataFrame(
        multivariate_normal.rvs(mean=[fp, minutes], cov=covariance, size=1000))
    distribution['total'] = distribution[0] / 36 * distribution[1]
    graph = sb.kdeplot(data=distribution['total'], fill=True)
    return graph, distribution['total'].quantile(quantile), covariance
def wincor(x, y, tr=.2):
    """
    Winsorized correlation (and covariance) between `x` and `y`.

    Rows where either value is NaN are dropped before winsorizing.

    :param x: Pandas Series (or numpy array)
        Data for group one

    :param y: Pandas Series (or numpy array)
        Data for group two

    :param tr: float
        Proportion to winsorize in each tail (default is .2)

    :return:
        Dictionary of results

        cor: float
            Winsorized correlation
        nval: int
            Number of complete observations
        sig: float
            p-value
        wcov: float
            Winsorized covariance
    """
    if type(x) is not np.ndarray:
        x, y = pandas_to_arrays([x, y])

    # Pair the observations column-wise and keep only complete rows.
    paired = np.c_[x, y]
    paired = paired[~np.isnan(paired).any(axis=1)]
    nval = paired.shape[0]
    x, y = paired[:, 0], paired[:, 1]

    n_obs = len(x)
    g = np.floor(tr * n_obs)  # observations winsorized per tail
    tails = (tr, tr)
    xvec = winsorize(x, limits=tails)
    yvec = winsorize(y, limits=tails)

    wcor = np.corrcoef(xvec, yvec)[0, 1]
    wcov = np.cov(xvec, yvec)[0, 1]

    test = wcor * np.sqrt((n_obs - 2) / (1. - wcor ** 2))
    sig = 2 * (1 - t.cdf(abs(test), n_obs - 2 * g - 2))

    return {'cor': wcor, 'wcov': wcov, 'sig': sig, 'nval': nval}
def do_prepare_data():
    """Build the lagged, winsorized demand table for every model key.

    Reads ``tables/1_thirty_models.csv``; for each ``model_key`` the series is
    loaded, padded to daily frequency, winsorized (5% per tail), imputed and
    plotted, and eight lag columns are added. The stacked result is ordered by
    model key and date, written to ``tables/2_data_prepared.csv`` and plotted.
    """
    main_df = pd.read_csv("tables/1_thirty_models.csv")

    # NOTE: the SQL below resolves `prepared_df` through locals(), so this
    # local variable must keep exactly this name.
    prepared_df = pd.DataFrame()
    for model_key in main_df['model_key'].unique().tolist():
        df = load_data(main_df=main_df, model_key=model_key)
        df = fill_missing_days__and_set_datetime_index(
            df, start_date="2012-01-05", end_date="2016-12-28")
        df.to_csv("tables/_fill_days.csv")

        clipped_demand = mstats.winsorize(df['Order_Demand'].values, limits=[0.05, 0.05])
        df.loc[:, 'Order_Demand'] = clipped_demand

        df = moving_average_imputation(df)
        plot(df, model_key, '2_imputation_example')

        # Tag every row with the model it belongs to.
        df['model_key'] = df['Order_Demand'].apply(lambda _value: model_key)
        for lag in range(1, 9):
            df['lag_{}'.format(lag)] = df['Order_Demand'].shift(lag)

        # The first rows have incomplete lags and are dropped.
        df = df.dropna()
        df['Date'] = df.index
        prepared_df = pd.concat([prepared_df, df])

    prepared_df = sqldf(
        "select model_key, Date, Order_Demand, lag_1, lag_2, lag_3, lag_4, lag_5, lag_6, lag_7, lag_8 from prepared_df ORDER BY 1,2",
        locals())
    prepared_df['Date'] = pd.to_datetime(prepared_df['Date'])
    prepared_df.to_csv("tables/2_data_prepared.csv", index=False)
    prepared_df.set_index(['Date'], inplace=True)
    thirty_plots(df=prepared_df, filename="3_prepared_data")
def win_sor(self, limit):
    '''Winsorize the feature data, replacing extreme factor values.

    The data set is modified in place: ``self.data[self.feature]`` is
    overwritten. ``limit`` is the fraction clipped in each tail, e.g.
    ``limit=0.1`` replaces the largest 10% and the smallest 10% of values.
    '''
    target = self.feature
    tails = [limit, limit]
    self.data[target] = winsorize(self.data[target], limits=tails)
def future_spreads(spread_current, spread_history):
    """Project a spread path over rows 0..10.

    Row 0 holds the current spreads. From row ``spread_norm_yrs`` (read from
    ``cma.val_dict``) onward every row equals the "normal" spread, i.e. the
    mean of each history column after dropping NaNs and winsorizing 5% per
    tail. The rows in between add a constant step to the previous row.
    """
    norm_reference = cma.val_dict['spread_norm_yrs']
    path = pd.DataFrame(spread_current).T.reset_index(drop=True)

    def _average(values):
        return sum(values) / len(values)

    # Winsorized long-run average for every history column.
    spread_norm = []
    for col in range(len(spread_history.columns)):
        clipped = mstats.winsorize(spread_history.iloc[:, col].dropna(),
                                   limits=[0.05, 0.05],
                                   inclusive=[True, True])
        spread_norm.append(_average(clipped))

    # Placeholder rows 1..10
    for year in range(1, 11):
        path.loc[year] = np.nan

    # Fully normalized years
    for year in range(norm_reference, 11):
        path.iloc[year, :] = spread_norm

    # Years leading up to normalization: previous row plus a constant step.
    for year in range(1, norm_reference):
        step = (path.iloc[norm_reference, :] - path.iloc[0, :]) / norm_reference
        path.iloc[year, :] = step + path.iloc[year - 1, :]

    return path
def cs_winFn(ser, level):
    """Winsorize the non-NaN values of a Series, keeping its original index.

    ``level`` is the fraction clipped in each tail. Entries that were NaN in
    the input stay NaN in the output.
    """
    full_index = ser.index
    present = ser.dropna()
    clipped = winsorize(array(present), limits=(level, level))
    # Re-expanding to the original index restores NaN at the dropped labels.
    return Series(clipped, index=present.index).reindex(full_index)
def _sub(sub):
    """Winsorize a dataframe column-wise (axis 0) and return the same frame.

    The limits come from ``winsorize_bounds`` in the enclosing scope.
    """
    clipped = mstats.winsorize(a=sub.values, limits=winsorize_bounds, axis=0)
    # winsorize hands back an array, not a dataframe; assigning through
    # sub[:] overwrites the frame's values while keeping the frame itself.
    sub[:] = clipped
    return sub
def winvar(x, tr=.2):
    """
    Compute the gamma Winsorized variance for the data in the vector x.
    tr is the amount of Winsorization which defaults to .2.
    NaN values are removed before winsorizing.

    :param x: array-like of numeric values
    :param tr: proportion to winsorize in each tail
    :return: winsorized sample variance (ddof=1, consistent with Wilcox/R);
        NaN when no non-NaN value is present
    """
    values = np.asarray(x, dtype=float)
    # NaNs sort to the end and would otherwise leak into the upper clip value.
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan
    y = winsorize(values, limits=(tr, tr))
    wv = np.var(y, ddof=1)
    return wv
def get_norm_side(mean, vol, ret, z):
    """Classify returns as +1 / -1 / 0 by their standardized distance from the mean.

    The standardized score is ``(ret - mean) / sqrt(vol)``. Scores above ``z``
    map to 1, below ``-z`` to -1, and anything in ``[-z, z]`` to 0. The output
    starts as a 2.5%-per-tail winsorized copy of ``ret``, so any entry matching
    none of the three conditions keeps its winsorized value.
    """
    side = winsorize(ret, limits=[0.025, 0.025])
    score = (ret - mean) / np.sqrt(vol)
    side[score > z] = 1
    side[score < -z] = -1
    side[(score >= -z) & (score <= z)] = 0
    return side
def treat_outliers(dataframe):
    """Winsorize every numeric column in place and return the same dataframe.

    The lowest 5% and the highest 10% of each numeric column are clipped;
    non-numeric columns are left alone.
    """
    numeric_columns = dataframe.select_dtypes(include=np.number).columns
    for column in list(dataframe):
        if column not in numeric_columns:
            continue
        dataframe[column] = winsorize(dataframe[column],
                                      limits=[0.05, 0.1],
                                      inclusive=(True, True))
    return dataframe
def et(ax, d):
    """Draw ``d.jx`` (transposed) as an image on ``ax`` and return the data.

    The transposed array is winsorized by 1% in each tail and shown with a
    colour scale symmetric around zero. The first column is skipped in the
    image (``x1 = 1``). The extent is taken from ``d.x`` and ``d.time``.

    Returns the winsorized (masked) array.
    """
    # print() with a single pre-formatted string emits the same text on
    # Python 2 and Python 3.
    print("ET-------------")

    A = np.transpose(d.jx)
    ny, nx = np.shape(A)

    # configuration space parameter
    x = d.x
    x1 = 1
    x2 = nx

    # temporal guiding vector
    t = d.time
    t1 = 0
    t2 = ny

    F = A
    print("min/max: %s %s" % (np.min(F), np.max(F)))

    F = mstats.winsorize(F, limits=[0.01, 0.01])
    # Symmetric colour limits so that zero sits in the middle of the colormap.
    vminmax = np.maximum(np.abs(np.min(F)), np.abs(np.max(F)))

    # faster plotting with imshow
    ax.imshow(F[t1:t2, x1:x2],
              extent=[x[x1], x[x2 - 1], t[t1], t[t2 - 1]],
              origin='lower',
              aspect='auto',
              interpolation='nearest',
              cmap='RdYlGn',
              vmin=-vminmax,
              vmax=vminmax)
    return F
def tobc(curr_arr, block_size, central_tendency, cutting_ratio, percentile):
    """Summarize each wrapped diagonal of a 2-D array and build a circulant matrix.

    For every starting column the walk visits one element per row, moving one
    column to the right each step and wrapping around at the last column. The
    visited values are reduced according to ``central_tendency`` ("sum",
    "mean" (sum divided by ``block_size[0]``), "median", "trim_mean",
    "percentile" or "win_mean"). The transposed circulant matrix of those
    statistics is returned as a copy.
    """
    n_rows = np.shape(curr_arr)[0]
    n_cols = np.shape(curr_arr)[1]
    answer = []
    for start_col in range(n_cols):
        diagonal = [curr_arr[row][(start_col + row) % n_cols]
                    for row in range(n_rows)]
        total = sum(diagonal)
        if central_tendency == "sum":
            temp_var = total
        elif central_tendency == "mean":
            temp_var = total / block_size[0]
        elif central_tendency == "median":
            temp_var = median(diagonal)
        elif central_tendency == "trim_mean":
            temp_var = stats.trim_mean(np.array(diagonal), cutting_ratio)
        elif central_tendency == "percentile":
            temp_var = np.percentile(np.array(diagonal), percentile)
        elif central_tendency == "win_mean":
            temp_var = mean(winsorize(np.array(diagonal), cutting_ratio))
        answer.append(temp_var)
    return np.copy(circulant(answer).transpose())
def preprocess(a):
    """Center, winsorize and scale an array.

    Infinite values are treated as missing, the NaN-aware mean is subtracted,
    remaining NaNs become 0, both tails are winsorized by ``WIN_LIMIT`` and
    the result is passed through ``preprocessing.scale``. The input array is
    not modified (``astype`` works on a copy).
    """
    values = a.astype(np.float64)
    values[np.isinf(values)] = np.nan
    centered = np.nan_to_num(values - np.nanmean(values))
    clipped = winsorize(centered, limits=[WIN_LIMIT, WIN_LIMIT])
    return preprocessing.scale(clipped)
def preprocess(a):
    """Center an array and winsorize it to its 2nd-98th percentile range.

    Infinite values are treated as missing, the NaN-aware mean is subtracted
    and remaining NaNs become 0. The input array is not modified.

    Returns the winsorized (masked) array.
    """
    a = a.astype(np.float64)
    a[np.isinf(a)] = np.nan
    a = np.nan_to_num(a - np.nanmean(a))
    # `limits` are the FRACTIONS cut from each tail, not percentiles:
    # [0.02, 0.98] would overwrite 98% of the data from the top and collapse
    # the array to a single value. Clip 2% on each side instead.
    a = winsorize(a, limits=[0.02, 0.02])
    return a
def normalize_image_0_1_by_6channel(X, y):
    """Winsorize and rescale channels 0-2 of a 6-channel array.

    Channels 3-5 are copied through unchanged. For channels 0-2 the top 10% of
    each channel is winsorized, their joint minimum is subtracted, and entries
    whose winsorized value is > 0 are divided by the joint standard deviation;
    the other entries keep the shifted (undivided) value.

    ``X`` is not modified; ``y`` is passed through.
    """
    print('--- Normalizing by channel, expecting 6 channel input ---')
    assert X.shape[4] == 6
    X_winsor = np.copy(X)
    # Only winsorize across 0 1 2, since 3 4 5 is already [0 1]
    for channel in range(3):
        X_winsor[:, :, :, :, channel] = mstats.winsorize(X[:, :, :, :, channel], [0, 0.1])

    first_three = X_winsor[:, :, :, :, 0:3]
    shifted = first_three - first_three.min()

    X_norm = np.copy(X_winsor)
    # np.divide with `where=` leaves positions where the condition is False
    # untouched in `out`; without an explicit `out` those positions would be
    # uninitialized memory. Seed `out` with the shifted values.
    X_norm[:, :, :, :, 0:3] = np.divide(shifted,
                                        first_three.std(),
                                        out=shifted.astype(np.float64),
                                        where=first_three > 0)
    return X_norm, y
def vectorWinsorize(input_v, limits=0.5):
    '''
    Return a winsorized copy of input_v; the input itself is left unchanged.

    limits is forwarded to winsorize: a single float is the fraction clipped
    in each tail, a pair gives the lower and upper fractions.
    '''
    return winsorize(np.copy(input_v), limits=limits)
def winsorize_df(self, train_df, test_df, valid_df, cols, lower, upper, test_set, valid_set):
    """Function to winsorize numeric values in a DataFrame to remove potential outliers

    The limits are derived from the training data only; the test and
    validation sets are then capped at the resulting training minimum and
    maximum. All frames are modified in place and also returned.

    Parameters
    ----------
    train_df : pandas.DataFrame
        DataFrame containing the training data
    test_df : pandas.DataFrame
        DataFrame containing the test data
    valid_df : pandas.DataFrame
        DataFrame containing the validation data
    cols : list
        list of columns to winsorize
    lower : float
        Lower value e.g. 0.05 will cap the data at the 5th percentile
    upper : float
        Upper value e.g. 0.05 will cap the data at the 95th percentile
    test_set : bool
        Boolean indicating if a test set has been provided
    valid_set : bool
        Boolean indicating if a validation set has been provided

    Returns
    -------
    train_df : pandas.DataFrame
        DataFrame containing the winsorized training data
    test_df : pandas.DataFrame
        DataFrame containing the winsorized test data
    valid_df : pandas.DataFrame
        DataFrame containing the winsorized validation data
    """
    for col in cols:
        clipped = winsorize(train_df[col], limits=(lower, upper))

        # Write the winsorized values back positionally. (The previous
        # drop/concat results were discarded, so the training data was never
        # actually changed.)
        train_df[col] = clipped.data
        winsor_min = train_df[col].min()
        winsor_max = train_df[col].max()

        # Now cap the test and validation sets using the values from the training data
        if test_set:
            test_df.loc[test_df[col] > winsor_max, col] = winsor_max
            test_df.loc[test_df[col] < winsor_min, col] = winsor_min

        if valid_set:
            valid_df.loc[valid_df[col] > winsor_max, col] = winsor_max
            valid_df.loc[valid_df[col] < winsor_min, col] = winsor_min

    return train_df, test_df, valid_df
def Winsorize(self, df, colname, tile):
    """Winsorize one column of ``df`` by ``tile`` in each tail, keeping NaNs.

    Only the non-NaN values take part in the winsorization, so missing values
    neither count towards the tail sizes nor become a clipping value. NaN
    positions are preserved. The column is written back as float and the same
    dataframe is returned.
    """
    values = df[colname].astype(float).values  # astype yields a private copy
    valid = ~np.isnan(values)
    # NaNs sort to the end of the sample; winsorizing them together with the
    # data would skip or distort the upper-tail clipping.
    if valid.any():
        values[valid] = mstats.winsorize(values[valid], limits=[tile, tile])
    df[colname] = values
    return (df)
def transform(self, X):
    """Clip every column of ``X`` in place and return ``X``.

    With ``self.method == 'winsorize'`` the columns are winsorized using
    ``self.low`` / ``self.high`` as tail fractions. With ``'ceilfloor'`` they
    are treated as absolute bounds: values below ``self.low`` are raised to
    it and values above ``self.high`` are lowered to it. Any other method
    leaves ``X`` untouched.
    """
    if self.method == 'winsorize':
        tails = (self.low, self.high)
        for name in X.columns.tolist():
            X.loc[:, name] = mstats.winsorize(X[name], limits=tails)
    elif self.method == 'ceilfloor':
        for name in X.columns.tolist():
            # may trigger pandas' SettingWithCopy warning on sliced frames
            too_low = X[name] < self.low
            X.loc[too_low, name] = self.low
            too_high = X[name] > self.high
            X.loc[too_high, name] = self.high
    return X
def transform(self, X):
    """Return a copy of ``X`` with the four grade columns winsorized.

    For each of ``NOTA_DE``, ``NOTA_EM``, ``NOTA_MF`` and ``NOTA_GO`` the
    lower limit is the share of rows equal to 0.0 and the upper limit is the
    share of rows greater than 10.0. ``X`` itself is not modified.
    """
    data = X.copy()
    for column in ('NOTA_DE', 'NOTA_EM', 'NOTA_MF', 'NOTA_GO'):
        share_zero = len(data[data[column] == 0.0]) / len(data)
        share_above_ten = len(data[data[column] > 10.0]) / len(data)
        data[column] = winsorize(data[column], limits=[share_zero, share_above_ten])
    return data
def ols_beta(df, window):
    """Add rolling-OLS beta and R^2 columns to ``df`` and return it.

    ``df.ic`` is regressed on ``df.ih`` (with intercept) over a rolling
    window. The beta is lagged by one row and winsorized by 1% in each tail;
    the R^2 is stored unlagged.

    NOTE(review): relies on ``pd.stats.ols.MovingOLS`` - confirm the pandas
    version in use still provides it.
    """
    rolling_fit = pd.stats.ols.MovingOLS(y=df.ic,
                                         x=df[['ih']],
                                         window_type='rolling',
                                         window=window,
                                         intercept=True)
    lagged_beta = rolling_fit.beta.ih.shift(1)
    df['ols_beta'] = lagged_beta
    df['ols_r2'] = rolling_fit.r2
    df.ols_beta = mstats.winsorize(df.ols_beta, limits=[0.01, 0.01])
    return df
def _winsorize(a, limits=None, inclusive=(True, True)): # drop masked data a1 = np.ma.compressed(a) # use .data to return an np.ndarray instead of a masked array try: wa = winsorize(a1, limits=limits, inclusive=inclusive).data except IndexError: wa = np.zeros(0, dtype=a.dtype) return wa
def fm_regression(self): data = self.cache_data.loc[:, ['forward_return'] + self.cache['factor_names']].copy() # need to winsorize for factor in self.cache['factor_names']: data[factor] = data[factor].apply(lambda x: winsorize(x, (0.25, 0.25))) # cross-sectional regression # time-series regression # test pass
def winsorize_norm_chromosome_data(read_5p_ends, chromosome, strand, genome_dict, nucs_to_count, to_winsorize = True, low = 0, high = 0.95):
    """
    Build a max-normalized density array for one chromosome and strand.

    :param read_5p_ends: nested mapping strand -> chromosome -> 1-indexed position -> count
    :param chromosome: chromosome key
    :param strand: strand key
    :param genome_dict: mapping chromosome -> sequence
    :param nucs_to_count: nucleotides whose positions are counted
    :param to_winsorize: winsorize the densities before normalizing
    :param low: lower winsorization fraction
    :param high: upper percentile; the fraction 1-high is clipped from the top
    :return: an array (now zero-indexed from 1-indexed) of densities for the given chromosome on the given strand,
    winsorized, and only for the given nucleotides, divided by its maximum
    """
    counts = read_5p_ends[strand][chromosome]
    sequence = genome_dict[chromosome]
    # Array length is the highest observed 1-indexed position.
    density_array = numpy.array([0] * max(counts.keys()))
    for position, count in counts.items():
        if sequence[position - 1] in nucs_to_count:
            density_array[position - 1] = count
    if to_winsorize:
        winsorize(density_array, limits=(low, 1 - high), inplace=True)
    peak = float(max(density_array))
    return density_array / peak
def normalize_dict_to_max(mutation_dict, winsorize_data = False, winsorization_limits = (0, 0.95)): all_values = [] normed_dict = {} for strand in mutation_dict: normed_dict[strand] = {} for chromosome in mutation_dict[strand]: normed_dict[strand][chromosome] = {} #print mutation_dict[strand][chromosome].values() all_values += mutation_dict[strand][chromosome].values() #print all_values if winsorize_data: winsorize(all_values, limits = (winsorization_limits[0], 1-winsorization_limits[1]), inplace = True) max_value = float(max(all_values)) for strand in mutation_dict: for chromosome in mutation_dict[strand]: for position in mutation_dict[strand][chromosome]: val = mutation_dict[strand][chromosome][position] if val < min(all_values): val = min(all_values) if val > max(all_values): val = max(all_values) normed_dict[strand][chromosome][position] = val/max_value return normed_dict
def fit_transform(self, X):
    """Winsorize ``X`` block-wise per electrode and record its range.

    The columns of ``X`` are split into ``self.cardElec`` consecutive blocks
    of equal width; each block is winsorized as a whole with ``self.limit``
    and written back into ``X`` (in place). Afterwards ``self.maximum`` and
    ``self.minimum`` hold the global extrema and ``self.fitted`` is True.

    Returns the modified ``X``.
    """
    self.fitted = True
    # Floor division: slice bounds must be integers (true division yields a
    # float on Python 3 and makes the slicing below fail).
    sizeSig = np.size(X, 1) // self.cardElec
    for numElec in range(self.cardElec):
        elecSlice = slice(numElec * sizeSig, (numElec + 1) * sizeSig, 1)
        X[:, elecSlice] = winsorize(X[:, elecSlice], limits=self.limit)
    self.maximum = np.max(X)
    self.minimum = np.min(X)
    return X
def winsorize_series(group, trim_prop):
    """Return the mean of ``group`` after winsorizing ``trim_prop`` in each tail."""
    tails = [trim_prop, trim_prop]
    clipped = mstats.winsorize(group, limits=tails)
    return clipped.mean()
def winsor(s):
    """Winsorize ``s`` by 1% in each tail and return the result."""
    one_percent_tails = [0.01, 0.01]
    return mstats.winsorize(s, limits=one_percent_tails)