class Shift(object):
    # asv benchmark for the frame-shift speedup (issue-5609).
    params = [0, 1]
    param_names = ['axis']

    def setup(self, axis):
        # Shared fixture: a 10,000 x 500 frame of uniform randoms.
        self.df = DataFrame(np.random.rand(10000, 500))

    def time_shift(self, axis):
        # Time a one-period shift along the parametrised axis.
        self.df.shift(1, axis=axis)
def test_shift_fill_value(self): # GH #24128 df = DataFrame([1, 2, 3, 4, 5], index=date_range('1/1/2000', periods=5, freq='H')) exp = DataFrame([0, 1, 2, 3, 4], index=date_range('1/1/2000', periods=5, freq='H')) result = df.shift(1, fill_value=0) assert_frame_equal(result, exp) exp = DataFrame([0, 0, 1, 2, 3], index=date_range('1/1/2000', periods=5, freq='H')) result = df.shift(2, fill_value=0) assert_frame_equal(result, exp)
def timeseries_to_supervised(data, lag=1):
    """Frame ``data`` as lag columns (t-lag .. t-1) followed by t itself."""
    df = DataFrame(data)
    # One shifted copy per lag step, then the original as the final columns.
    frames = [df.shift(offset) for offset in range(1, lag + 1)]
    frames.append(df)
    supervised = concat(frames, axis=1)
    # Zero-fill the NaN holes created by shifting.
    supervised.fillna(0, inplace=True)
    return supervised
def main():
    """Download adjusted prices/volumes for a few tickers and compute returns.

    Fixes: ``all_data`` was never initialised (NameError on first use); the
    start date was the malformed ``'1/3/200'`` (year 200); and ``.iteritems()``
    was removed from pandas — ``.items()`` is used instead.
    """
    all_data = {}  # BUG fix: previously undefined, crashed on first assignment
    for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
        # BUG fix: start date was '1/3/200'; intended '1/3/2000'.
        all_data[ticker] = web.get_data_yahoo(ticker, '1/3/2000', '12/31/2009')
    # BUG fix: .iteritems() no longer exists on dicts/DataFrames; use .items().
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})
    # NOTE(review): denominator is the *current* price; conventional simple
    # returns divide by price.shift(1) — confirm this is intended.
    returns = (price - price.shift(1)) / price
def test_shift_categorical(self): # GH 9416 s1 = pd.Series(["a", "b", "c"], dtype="category") s2 = pd.Series(["A", "B", "C"], dtype="category") df = DataFrame({"one": s1, "two": s2}) rs = df.shift(1) xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) assert_frame_equal(rs, xp)
def test_shift_bool(self): df = DataFrame({'high': [True, False], 'low': [False, False]}) rs = df.shift(1) xp = DataFrame(np.array([[np.nan, np.nan], [True, False]], dtype=object), columns=['high', 'low']) assert_frame_equal(rs, xp)
def test_shift_categorical(self): # GH 9416 s1 = pd.Series(['a', 'b', 'c'], dtype='category') s2 = pd.Series(['A', 'B', 'C'], dtype='category') df = DataFrame({'one': s1, 'two': s2}) rs = df.shift(1) xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)}) assert_frame_equal(rs, xp)
def estimateBeta(priceY, priceX, algo="standard"):
    """
    estimate stock Y vs stock X beta using iterative linear regression.
    Outliers outside 3 sigma boundary are filtered out

    Parameters
    --------
    priceX : price series of x (usually market)
    priceY : price series of y (estimate beta of this price)

    Returns
    --------
    beta : stockY beta relative to stock X
    """
    prices = DataFrame({"x": priceX, "y": priceY})

    if algo == "returns":
        # Simple returns as a plain ndarray, NaN rows dropped.
        rets = (prices / prices.shift(1) - 1).dropna().values
        x, y = rets[:, 0], rets[:, 1]

        # Refit repeatedly, discarding residuals beyond 3 sigma each pass.
        step, n_outliers = 1, 1
        while step < 10 and n_outliers > 0:
            slope, intercept = polyfit(x, y, 1)
            residual = polyval([slope, intercept], x) - y
            outlier_mask = abs(residual) > 3 * np.std(residual)
            n_outliers = sum(outlier_mask)
            beta = slope
            x, y = x[~outlier_mask], y[~outlier_mask]
            step += 1
    elif algo == "log":
        # Fit log(y) against log(x); the slope is the beta.
        slope, _ = polyfit(np.log(prices["x"]), np.log(prices["y"]), 1)
        beta = slope
    elif algo == "standard":
        # Covariance of log-returns over the variance of the market.
        log_rets = np.log(prices).diff().dropna()
        beta = log_rets["x"].cov(log_rets["y"]) / log_rets["x"].var()
    else:
        raise TypeError("unknown algorithm type, use 'standard', 'log' or 'returns'")

    return beta
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a (multivariate) sequence as a supervised-learning table."""
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    frames, headers = [], []
    # Input columns: t-n_in .. t-1.
    for lag in range(n_in, 0, -1):
        frames.append(df.shift(lag))
        headers.extend('var%d(t-%d)' % (var + 1, lag) for var in range(n_vars))
    # Forecast columns: t .. t+n_out-1.
    for step in range(n_out):
        frames.append(df.shift(-step))
        if step:
            headers.extend('var%d(t+%d)' % (var + 1, step) for var in range(n_vars))
        else:
            headers.extend('var%d(t)' % (var + 1) for var in range(n_vars))
    agg = concat(frames, axis=1)
    agg.columns = headers
    # Remove rows made incomplete by the shifting.
    if dropnan:
        agg.dropna(inplace=True)
    return agg
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Build lag features plus a single (last-variable) target column."""
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    frames, headers = [], []
    # Input sequence: t-n_in .. t-1.
    for lag in range(n_in, 0, -1):
        frames.append(df.shift(lag))
        headers.extend('var%d(t-%d)' % (var + 1, lag) for var in range(n_vars))
    # Target: current-time values with all but the last variable removed.
    target = df.shift(0)
    surplus = [idx for idx in range(n_vars - 1)]
    target.drop(target.columns[surplus], axis=1, inplace=True)
    frames.append(target)
    headers.append('var%d' % (n_out - 1))
    combined = pd.concat(frames, axis=1)
    combined.columns = headers
    # Remove rows made incomplete by the shifting.
    if dropnan:
        combined.dropna(inplace=True)
    return combined
def create_dataset_for_supervised_learning(dataset, num_shift=1):
    """Concatenate ``num_shift`` lagged copies of the data ahead of the original."""
    base = DataFrame(dataset)
    shifted_frames = []  # shifted temporal sequences, oldest lag first
    for steps in range(1, num_shift + 1):
        # Shift the temporal sequence forward by `steps` timesteps and keep
        # the list ordered with the largest lag first.
        shifted_frames.insert(0, base.shift(steps))
    # Append the original (unshifted) sequence as the final columns.
    shifted_frames.append(base)
    # Concatenate column-wise, keeping a single shared row index.
    return concat(shifted_frames, axis=1)
def generate_data(n_timesteps):
    """Build (X, y) pairs coupling each value with its one-step lag."""
    seq = array(generate_sequence(n_timesteps))
    frame = DataFrame(seq)
    # Pair the lagged column with the current column.
    frame = concat([frame.shift(1), frame], axis=1)
    # Replace the NaN introduced by the lag with -1.
    frame.fillna(-1, inplace=True)
    values = frame.values
    # Both columns form the input; the current value is the target.
    X, y = values, values[:, 1]
    X = X.reshape(len(X), 2, 1)
    y = y.reshape(len(y), 1)
    return X, y
def timeseries_to_supervised(data, lag=1):
    """Turn a series into [t-lag .. t-1, t] supervised columns (NaN -> 0)."""
    df = DataFrame(data)
    # Build a *list* of shifted frames: df.shift(1) alone would give a single
    # frame, whereas concat needs the list structure built here.
    lagged = [df.shift(step) for step in range(1, lag + 1)]
    # The unshifted frame becomes the final (target) columns.
    lagged.append(df)
    # Column-wise merge of the shifted copies and the original.
    merged = concat(lagged, axis=1)
    # Zero-fill the NaN holes created by shifting.
    merged.fillna(0, inplace=True)
    return merged
def to_supervised(sequence, n_in, n_out):
    """Create lagged copies of ``sequence`` and split into (X, y) windows.

    BUG fix: the ``concat`` building the lag columns had been commented out,
    so ``df`` was never widened and the reshape below failed for any
    ``n_in > 1``. The line is restored (debug prints removed).

    :param sequence: 2-D array, shape (timesteps, width)
    :param n_in: observation window length
    :param n_out: prediction window length (first n_out*width columns)
    :return: (X, y) arrays shaped (samples, n_in, width) / (samples, n_out, width)
    """
    df = DataFrame(sequence)
    # Lag copies: columns shift(n_in-1) .. shift(0), i.e. t-(n_in-1) .. t.
    df = concat([df.shift(n_in - i - 1) for i in range(n_in)], axis=1)
    # Drop leading rows containing shift-introduced NaNs.
    df.dropna(inplace=True)
    values = df.values
    width = sequence.shape[1]
    # Inputs: each sample is n_in timesteps of `width` features.
    X = values.reshape(len(values), n_in, width)
    # Outputs: the first n_out*width columns per sample.
    y = values[:, 0:(n_out * width)].reshape(len(values), n_out, width)
    return X, y
def prepare_timestep_data(data, lookback=1, dropnan=True):
    """Assemble lag columns X(t-lookback) .. X(t-1) for supervised learning."""
    num_features = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)
    pieces, labels = [], []
    # One shifted copy per lookback step, largest lag first.
    for step in range(lookback, 0, -1):
        pieces.append(frame.shift(step))
        labels += ['X%d(t-%d)' % (feat + 1, step) for feat in range(num_features)]
    table = concat(pieces, axis=1)
    table.columns = labels
    # Remove rows made incomplete by the shifting.
    if dropnan:
        table.dropna(inplace=True)
    return table
def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
    """Create a shifted version of ``time_series``.

    Parameters
    ----------
    time_series : pd.DataFrame, shape (n_samples, 1), required
        The DataFrame to shift.

    Returns
    -------
    pd.DataFrame, shape (n_samples, 1)
        The shifted version of the original ``time_series``, with the
        columns renamed by ``self._rename_columns``.
    """
    shifted = time_series.shift(self.shift)
    return self._rename_columns(shifted)
def lag_var(data, n_in=1, n_out=1, dropnan=True):
    """Produce lagged (t-n_in .. t-1) copies of each column of ``data``."""
    n_vars = 1 if type(data) is list else data.shape[1]
    original_cols = data.columns
    frame = DataFrame(data)
    pieces, labels = [], []
    for lag in range(n_in, 0, -1):
        pieces.append(frame.shift(lag))
        # Label each column with its name plus the lag offset.
        labels += [col + '_var(t-%d)' % lag
                   for _, col in zip(range(n_vars), original_cols)]
    table = concat(pieces, axis=1)
    table.columns = labels
    # Remove rows made incomplete by the shifting.
    if dropnan:
        table.dropna(inplace=True)
    return table
def clean_consecutive_duplicates(
        move_data: DataFrame,
        subset: Optional[Union[int, Text]] = None,
        keep: Optional[Union[Text, bool]] = 'first',
        inplace: Optional[bool] = False) -> Optional[DataFrame]:
    """Remove consecutive duplicate rows, optionally over a column subset.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data.
    subset : Array of strs, optional
        Column label(s) considered when identifying duplicates, by default None.
    keep : 'first', 'last', optional
        Which occurrence of a run of duplicates survives, by default 'first'.
    inplace : boolean, optional
        Drop in place when True, otherwise return a filtered copy,
        by default False.

    Returns
    -------
    DataFrame
        The trajectory points without consecutive duplicates, or None.
    """
    # 'first' keeps the head of each run (compare against the previous row);
    # anything else compares against the following row instead.
    step = 1 if keep == 'first' else -1
    frame = move_data if subset is None else move_data[subset]
    keep_mask = (frame.shift(step) != frame).any(axis=1)
    return move_data.drop(index=move_data[~keep_mask].index, inplace=inplace)
def make_dataset(data, n_input=1, out_index=0, dropnan=True):
    """Lagged inputs (t-n_input .. t-1) plus column ``out_index`` as target."""
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)
    pieces, labels = [], []
    # Input columns.
    for lag in range(n_input, 0, -1):
        pieces.append(frame.shift(lag))
        labels += ['var%d(t-%d)' % (var + 1, lag) for var in range(n_vars)]
    # Output column at time t.
    pieces.append(frame[frame.columns[out_index]])
    labels += ['result']
    table = concat(pieces, axis=1)
    table.columns = labels
    # Remove rows made incomplete by the shifting.
    if dropnan:
        table.dropna(inplace=True)
    return table
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Forecast-only framing: columns for t, t+1, ... t+n_out-1."""
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)
    pieces, labels = [], []
    for horizon in range(n_out):
        pieces.append(frame.shift(-horizon))
        # Time-t columns are labelled '(t)', later ones '(t+k)'.
        suffix = '(t)' if horizon == 0 else '(t+%d)' % horizon
        labels += ['var%d%s' % (var + 1, suffix) for var in range(n_vars)]
    table = concat(pieces, axis=1)
    table.columns = labels
    # Remove trailing rows made incomplete by the negative shifts.
    if dropnan:
        table.dropna(inplace=True)
    return table
def _generate_lags(df: pd.DataFrame, lags: Union[int, List[int]]) -> pd.DataFrame: df = df.copy() # Constructs variables with lagged values. if isinstance(lags, int): lags = range(lags) cols = df.columns collection = list() for L in lags: df_lagged = df.shift(L) df_lagged.columns = [x + f"_L{L}" for x in cols] collection.append(df_lagged) merged = pd.concat(collection, axis=1) cols = merged.columns merged = merged[sorted(merged.columns)] return merged
def convert_dataset(data, n_input, out_index=0, dropnan=True):
    """Lag-feature table (t-n_input .. t-1) plus a 'results' output column.

    BUG fix: the input-column label format was missing its closing
    parenthesis, producing names like ``'var1(t-1'``; now ``'var1(t-1)'``.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    # input sequence (t-n,...,t-1)
    for i in range(n_input, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # output column at time t
    cols.append(df[df.columns[out_index]])
    names += ['results']
    # concat input/output columns
    result = concat(cols, axis=1)
    result.columns = names
    # delete rows with NaN introduced by the shifts
    if dropnan:
        result.dropna(inplace=True)
    return result
def to_supervised(sequence, n_in, n_out):
    """Create lag copies of ``sequence``, returning (X, y) windows.

    BUG fix: Python 2 ``print`` statements made this a syntax error on
    Python 3; they are now ``print()`` calls. Logic is unchanged.
    """
    df = DataFrame(sequence)
    print('df=', df.values.shape)
    # Columns are shift(n_in-1) .. shift(0): timesteps t-(n_in-1) .. t.
    df = concat([df.shift(n_in - i - 1) for i in range(n_in)], axis=1)
    print(df.head(20))
    # Drop rows with missing values introduced by the shifts.
    df.dropna(inplace=True)
    # Specify columns for input and output pairs.
    values = df.values
    print('len values=', len(values))
    print('values=', values.shape)
    width = sequence.shape[1]
    # Each sample: n_in timesteps of `width` features.
    X = values.reshape(len(values), n_in, width)
    # Targets: the first n_out*width columns per sample.
    y = values[:, 0:(n_out * width)].reshape(len(values), n_out, width)
    print('X=', X, '\ny=', y)
    return X, y
def convert_dataset(data, n_input=1, out_index=0, dropnan=True):
    """Build lag inputs (t-n_input .. t-1) and a 'result' target column."""
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)
    pieces, labels = [], []
    # Input sequence (t-n, ... t-1).
    for lag in range(n_input, 0, -1):
        pieces.append(frame.shift(lag))
        labels += ['var%d(t-%d)' % (var + 1, lag) for var in range(n_vars)]
    # Output value at time t.
    pieces.append(frame[frame.columns[out_index]])
    labels += ['result']
    # Merge the input and output sequences column-wise.
    merged = concat(pieces, axis=1)
    merged.columns = labels
    # Drop rows containing missing values.
    if dropnan:
        merged.dropna(inplace=True)
    return merged
def find_growth_index(x: pd.DataFrame, nlist=nlist):
    """Compute n-day change plus max/min n-day changes within 2*max(nlist) rows.

    BUG fix: ``x.sort_values(...)`` discarded its result (``sort_values`` is
    not in-place by default), so the frame was never actually sorted by date
    before the head slice was taken. The result is now assigned back.
    Debug prints removed.

    NOTE(review): expects columns '日期' (date) and '收盘价' (close price);
    ``nlist`` defaults to a module-level list — confirm it is defined.
    """
    x = x.copy()
    # BUG fix: assign the sorted frame back (was a no-op before).
    x = x.sort_values(by='日期', ascending=False)
    x = x[0:2 * max(nlist)].copy()
    x.reset_index(inplace=True)
    res = pd.DataFrame()
    for n in nlist:
        str1 = str(n) + '日涨跌幅'
        # n-day change: (latest close - close n rows back) / close n rows back.
        res.loc[0, str1] = ((x.loc[0, '收盘价']) - (x.loc[n, '收盘价'])) / x.loc[n, '收盘价']
        xc = x.shift(n)
        str2 = str(n) + '日最大涨幅' + str(2 * max(nlist)) + '日内'
        str3 = str(n) + '日最小涨幅' + str(2 * max(nlist)) + '日内'
        # Rolling n-day change across the window (first n rows are NaN-based).
        up_down = ((xc['收盘价'] - x['收盘价']) / x['收盘价'])[n:]
        res.loc[0, [str2, str3]] = [max(up_down), min(up_down)]
    return res
def fetch_expected_weights(assets: list,
                           d: pd.DataFrame,
                           num_portfolios: int = 10000,
                           rfr=0.05,
                           no_of_days=365):
    """Monte-Carlo portfolio search over random weights.

    Returns the (min-volatility, max-Sharpe) portfolios as produced by
    ``get_dict_result``.
    """
    log_returns = np.log(d / d.shift(1))
    mean_returns = log_returns.mean()
    cov_matrix = log_returns.cov()
    n_assets = len(assets)

    sim_returns, sim_vols, sim_sharpes, sim_weights = [], [], [], []
    for _ in range(num_portfolios):
        # Random weights, normalised to sum to 1.
        w = np.random.random_sample(n_assets)
        w /= np.sum(w)
        vol, ret = portfolio_annualised_performance(
            w, mean_returns, cov_matrix, no_of_days)
        sim_weights.append(w)
        sim_returns.append(ret)
        sim_vols.append(vol)
        # Sharpe ratio relative to the risk-free rate.
        sim_sharpes.append((ret - rfr) / vol)

    portfolio = {
        'Returns': sim_returns,
        'Volatility': sim_vols,
        'Sharpe Ratio': sim_sharpes,
    }
    # One weight column per asset symbol.
    for pos, symbol in enumerate(assets):
        portfolio[symbol] = [w[pos] for w in sim_weights]

    frame = pd.DataFrame(portfolio)
    frame = frame[['Returns', 'Volatility', 'Sharpe Ratio'] + [s for s in assets]]

    best_sharpe = frame['Sharpe Ratio'].max()
    least_vol = frame['Volatility'].min()
    max_sharpe_port = frame.loc[frame['Sharpe Ratio'] == best_sharpe]
    min_volatility_port = frame.loc[frame['Volatility'] == least_vol]
    return (get_dict_result(min_volatility_port),
            get_dict_result(max_sharpe_port))
def generate_factor(self):
    """Build the HK-holdings * close-price factor over the configured window."""
    close = DataFrame({
        stock: pd.read_csv(
            '%s/StockDailyData/Stock/%s.csv' % (gc.DATABASE_PATH, stock),
            index_col=[0], parse_dates=[0]).loc[:, 'close']
        for stock in self.stocks})
    hk = pd.read_csv('%s/StockMoneyData/HK.csv' % gc.DATABASE_PATH,
                     index_col=[0], parse_dates=[0])
    # Forward-fill the holdings, then zero any still-missing leading values.
    hk.fillna(method='ffill', inplace=True)
    hk.fillna(0, inplace=True)
    # Keep only the columns whose code starts with '3'.
    cols = list(filter(lambda c: c[0] == '3', hk.columns))
    hk = DataFrame(hk.loc[:, cols], index=close.index, columns=cols)
    # Restrict closes to the configured date window.
    close = close.loc[close.index >= self.start_date, :]
    close = close.loc[close.index <= self.end_date, :]
    # Use previous-day holdings, aligned to the close index.
    hk = hk.shift().loc[close.index, :]
    hk_hold = DataFrame(0, index=close.index, columns=self.stocks)
    hk_hold.loc[hk.index, hk.columns] = hk
    factor = hk_hold * close
    factor = factor.loc[factor.index >= self.start_date, :]
    factor = factor.loc[factor.index <= self.end_date, :]
    self.factor = factor
def convert_dataset(data, n_input=1, out_index=0, dropnan=True):
    """Frame ``data`` as lagged inputs (t-n_input .. t-1) plus a target column."""
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    feature_frames = []
    feature_names = []
    # Input sequences (t-n, ..., t-1); shift pushes data later along the
    # time axis, creating the lagged copies.
    for steps_back in range(n_input, 0, -1):
        feature_frames.append(df.shift(steps_back))
        feature_names.extend('var%d(t-%d)' % (v + 1, steps_back)
                             for v in range(n_vars))
    # Output sequence at time t: the selected column.
    feature_frames.append(df[df.columns[out_index]])
    feature_names.append('result')
    # Concatenate inputs and output.
    result = concat(feature_frames, axis=1)
    result.columns = feature_names
    # Delete rows holding NaN values from the shifts.
    if dropnan:
        result.dropna(inplace=True)
    return result
def _lag_df(df: pd.DataFrame, lags: Union[int, list]) -> pd.DataFrame: """ Advances in Financial Machine Learning, Snipet 17.3, page 259. Apply Lags to DataFrame :param df: (int or list) Either number of lags to use or array of specified lags :param lags: (int or list) Lag(s) to use :return: (pd.DataFrame) Dataframe with lags """ df_lagged = pd.DataFrame() if isinstance(lags, int): lags = range(1, lags + 1) else: lags = [int(lag) for lag in lags] for lag in lags: temp_df = df.shift(lag).copy(deep=True) temp_df.columns = [str(i) + '_' + str(lag) for i in temp_df.columns] df_lagged = df_lagged.join(temp_df, how='outer') return df_lagged
def test_datetime_frame_shift_with_freq(self, datetime_frame):
    # Shifting with freq='infer' round-trips back to the original frame.
    shifted = datetime_frame.shift(1, freq="infer")
    unshifted = shifted.shift(-1, freq="infer")
    tm.assert_frame_equal(datetime_frame, unshifted)

    # Shifting by the index's own freq matches 'infer'.
    shifted2 = datetime_frame.shift(freq=datetime_frame.index.freq)
    tm.assert_frame_equal(shifted, shifted2)

    # Same shifts on a frame whose freq must be inferred from the values.
    inferred_ts = DataFrame(
        datetime_frame.values,
        Index(np.asarray(datetime_frame.index)),
        columns=datetime_frame.columns,
    )
    shifted = inferred_ts.shift(1, freq="infer")
    expected = datetime_frame.shift(1, freq="infer")
    expected.index = expected.index._with_freq(None)
    tm.assert_frame_equal(shifted, expected)

    unshifted = shifted.shift(-1, freq="infer")
    tm.assert_frame_equal(unshifted, inferred_ts)
def test_shift_dt64values_int_fill_deprecated(self):
    # GH#31971: integer fill_value on datetime64 columns is deprecated.
    ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])

    frame = ser.to_frame()
    with tm.assert_produces_warning(FutureWarning):
        result = frame.shift(1, fill_value=0)
    expected = Series([pd.Timestamp(0), ser[0]]).to_frame()
    tm.assert_frame_equal(result, expected)

    # Same deprecation when shifting along axis=1.
    frame2 = DataFrame({"A": ser, "B": ser})
    frame2._consolidate_inplace()
    with tm.assert_produces_warning(FutureWarning):
        result = frame2.shift(1, axis=1, fill_value=0)
    expected = DataFrame({"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": frame2["A"]})
    tm.assert_frame_equal(result, expected)
def to_supervised_Act_Per(_actList, _perList, n_in):
    """Window activations into n_in-step inputs aligned with per-step targets."""
    activations = DataFrame(_actList)
    features = DataFrame()
    # Columns shift(0), shift(-1), ... shift(-(n_in-1)): timesteps t .. t+n_in-1.
    for step in range(n_in):
        features = concat([features, activations.shift(-step)], axis=1)
    features.dropna(inplace=True)
    # Trim the first n_in-1 targets so rows stay aligned with the windows.
    for _ in range(n_in - 1):
        _perList = np.delete(_perList, 0, 0)
    targets = DataFrame(_perList)
    feat_values, targ_values = features.values, targets.values
    X = feat_values.reshape(len(feat_values), n_in, -1)
    y = targ_values.reshape(len(targ_values), 1, -1)
    return X, y
def dataframe_to_supervised(data: pd.DataFrame, sequence_length: int = 1) -> list:
    """
    Does the same as timeseries_to_supervised but with whole DataFrame (all columns)

    BUG fix: the original appended ``data`` itself to the result list and then
    dropped rows from it *in place*, silently mutating the caller's DataFrame.
    A copy is appended instead; returned frames are unchanged otherwise.

    :param data: any dataframe
    :param sequence_length: how many dataframes will be in the list
    :return: a list of dataframes with reset index
    """
    assert sequence_length >= 1
    n_lags = sequence_length - 1
    frames = [data.shift(i) for i in range(n_lags, 0, -1)]
    frames.append(data.copy())  # copy so the input frame is left untouched
    for frame in frames:
        # Drop the leading rows containing shift-introduced NaNs, then reindex.
        frame.drop(frame.index[:n_lags], inplace=True)
        frame.reset_index(drop=True, inplace=True)
    # no need to drop from the bottom (DataFrame.shift keeps the frame size)
    return frames
def get_quality_adjustments(
    quality_value: pd.DataFrame,
    to_reset: Optional[pd.DataFrame] = None,
    to_adjust: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Return cumulative quality adjustment factors for given values.

    Accumulates quality adjustments across each Feb-Jan+1 window, resetting
    back to a neutral factor of 1 where a reset occurs. By default a factor
    is the quality value divided by the previous period's value; ``to_adjust``
    restricts which cells are adjusted.

    Parameters
    ----------
    quality_value : DataFrame
        The quality value used to calculate quality adjustments.
    to_reset : DataFrame
        Boolean mask of quality adjustments to be reset.
    to_adjust : DataFrame
        Boolean mask of values to be adjusted.

    Returns
    -------
    DataFrame
        Cumulative adjustment factors for base prices.
    """
    # Period-on-period factor: each value divided by the previous period's.
    factors = quality_value.div(quality_value.shift(1, axis=1))

    # Outside the adjustment mask, force a neutral factor of 1.
    if to_adjust is not None:
        factors[~to_adjust] = 1

    if to_reset is not None:
        # Resets impute the inverse cumulative growth so the running product
        # returns to 1 at the reset period.
        impute_resets = get_cumulative_adjustments(factors).pow(-1)
        factors = factors.mask(to_reset, impute_resets)

    # The first period has no prior value; fill with 1, i.e. no adjustment.
    return get_cumulative_adjustments(factors).fillna(1)
def auto_cor_cov(self, data: pd.DataFrame, order: int = 2, decay: int = 2) -> pd.DataFrame: """ 矩阵与矩阵相关性计算: A = np.array([[a11,a21],[a12,a22]]) B = np.array([[b11,b21],[b12,b22]]) matrix = [[cov([a11,a21], [a11,a21]), cov([a11,a21], [a12,a22]), cov([a11,a21], [b11,b21]), cov([a11,a21], [b12,b22])], [cov([a12,a22], [a11,a21]), cov([a12,a22], [a12,a22]), cov([a12,a22], [b11,b21]), cov([a12,a22], [b12,b22])], [cov([b11,b21], [a11,a21]), cov([b11,b21], [a12,a22]), cov([b11,b21], [b11,b21]), cov([b11,b21], [b12,b22])], [cov([b12,b22], [a11,a21]), cov([b12,b22], [a12,a22]), cov([b12,b22], [b11,b21]), cov([b12,b22], [b12,b22])]] 自相关协方差矩阵为: matrix_at_cor_cov = [[cov([a11,a21], [b11,b21]), cov([a11,a21], [b12,b22])], [cov([a12,a22], [b11,b21]), cov([a12,a22], [b12,b22])] 注: 输入pd.DataFrame格式的数据计算协方差会以行为单位向量进行计算 计算出来的协方差矩阵中右上角order*order矩阵才是自相关矩阵 协方差矩阵:横向为当期与各因子滞后阶数的协方差;纵向为滞后阶数与当期各因子的协方差 :param data: :param order: :param decay: :return: """ # order matrix matrix_order = data.shift(order).dropna(axis=0, how='all') matrix = data.iloc[order:, :].copy(deep=True) w_list = self.Half_time(period=matrix.shape[0], decay=decay) w_list = sorted(w_list, reverse=False) # 升序排列 covs = np.cov(matrix.T, matrix_order.T, aweights=w_list) # 需要再测试 cov_order = pd.DataFrame(covs[:-matrix.shape[1], -matrix.shape[1]:], index=matrix.columns, columns=matrix.columns) return cov_order
def _add_dribbles(actions: pd.DataFrame) -> pd.DataFrame:
    """Insert synthetic 'dribble' actions between consecutive same-team actions."""
    nxt = actions.shift(-1)

    # A dribble links two consecutive actions of the same team ...
    same_team = actions.team_id == nxt.team_id
    # not_clearance = actions.type_id != actiontypes.index("clearance")

    # ... whose spatial gap lies between the min and max dribble length ...
    dx = actions.end_x - nxt.start_x
    dy = actions.end_y - nxt.start_y
    gap_sq = dx ** 2 + dy ** 2
    far_enough = gap_sq >= min_dribble_length ** 2
    not_too_far = gap_sq <= max_dribble_length ** 2

    # ... within the same period and close enough in time.
    elapsed = nxt.time_seconds - actions.time_seconds
    same_phase = elapsed < max_dribble_duration
    same_period = actions.period_id == nxt.period_id

    mask = same_team & far_enough & not_too_far & same_phase & same_period

    prev = actions[mask]
    follow = nxt[mask]

    dribbles = pd.DataFrame()
    dribbles["game_id"] = follow.game_id
    dribbles["period_id"] = follow.period_id
    # Fractional id slots the dribble between the two real actions when sorted.
    dribbles["action_id"] = prev.action_id + 0.1
    dribbles["time_seconds"] = (prev.time_seconds + follow.time_seconds) / 2
    dribbles["timestamp"] = follow.timestamp
    dribbles["team_id"] = follow.team_id
    dribbles["player_id"] = follow.player_id
    # The dribble spans from the previous action's end to the next one's start.
    dribbles["start_x"] = prev.end_x
    dribbles["start_y"] = prev.end_y
    dribbles["end_x"] = follow.start_x
    dribbles["end_y"] = follow.start_y
    dribbles["bodypart_id"] = spadlconfig.bodyparts.index("foot")
    dribbles["type_id"] = spadlconfig.actiontypes.index("dribble")
    dribbles["result_id"] = spadlconfig.results.index("success")

    # Merge, re-sort chronologically, and renumber the action ids.
    actions = pd.concat([actions, dribbles], ignore_index=True, sort=False)
    actions = actions.sort_values(
        ["game_id", "period_id", "action_id"]
    ).reset_index(drop=True)
    actions["action_id"] = range(len(actions))
    return actions
def get_data(self):
    """Fetch closes over the lookback window from Wind and return the
    annualised volatility of daily returns.

    :return: annualised volatility values, validated via FactorsZoo.check_data
    """
    date = self.date
    # Window start: `self.window` months back from `date` (Wind trading-day offset).
    pre_date = w.tdaysoffset(self.window, date, "Period=M")
    pre_date = pre_date.Data[0][0].strftime("%Y-%m-%d")
    # Daily closes for all configured codes, forward-filled by Wind.
    volitality = w.wsd(self.stockcodes, "close", pre_date, date, "Fill=Previous")
    if volitality.ErrorCode != 0:
        # Wind reported a data-extraction failure.
        print("数据提取异常")
        raise Exception("数据提取异常")
    vol = DataFrame(np.array(volitality.Data).T,
                    columns=volitality.Codes,
                    index=volitality.Times)
    # Simple daily returns.
    ret = vol / vol.shift(1) - 1
    # Annualise the daily-return std with sqrt(252) trading days.
    volitality = FactorsZoo.check_data(
        (math.sqrt(252) * ret.std()).values.tolist())
    return volitality
def sentiment_stockPrice_series(sentimentScore, stockPrice, scaler):
    """Build a supervised table of (sentiment score, price, next-day trend).

    Reads a sentiment-score CSV and a stock-price CSV, scales both with the
    supplied ``scaler``, and labels each row with whether the *next* day's
    scaled price is higher (1) or not (0).

    NOTE(review): the last row's shifted price is NaN, so its trend label
    falls through to 0 — confirm that is intended.
    """
    # get the sentiment score and the close price of the stock from input file
    sentiment = read_csv(sentimentScore, header=0, index_col=0).values
    stockPrice = read_csv(stockPrice, header=0, index_col=0).values
    print(stockPrice)
    # transfer all data into float32
    sentiment = sentiment.astype('float32')
    stockPrice = stockPrice.astype('float32')
    # normalize features
    sentimentS = scaler.fit_transform(sentiment)
    stockPriceS = scaler.fit_transform(stockPrice)
    # build a supervised learning dataset
    score = DataFrame(sentimentS, index=[i for i in range(0, len(sentimentS))])
    price = DataFrame(stockPriceS, index=[i for i in range(0, len(stockPriceS))])
    # Tprice holds tomorrow's scaled price for each row.
    Tprice = price.shift(-1)
    # Binarise: 1 if tomorrow's price is above today's, else 0.
    for i in range(0, Tprice.size, 1):
        if (Tprice.at[i, 0] - price.at[i, 0] > 0):
            Tprice.at[i, 0] = 1
        else:
            Tprice.at[i, 0] = 0
    cols, names = list(), list()
    cols.append(score)
    cols.append(price)
    cols.append(Tprice)
    names.append('score')
    names.append('price')
    names.append('trend')
    # put names and the data in cols together
    result = concat(cols, axis=1)
    result.columns = names
    return result
def to_supervised(data, n_in=1, n_out=1, diff_in=False, diff_out=True, drop_nan=True):
    '''
    @CopyRight: Code is inspired by weblog of machinelearningmastery.com

    Copies the columns of an nD sequence so each timestep has an "in" window
    and an "out" window.

    :param data:
    :param n_in: length of "in" seq (number of observations)
    :param n_out: length of "out" seq (number of predictions)
    :param diff_in: if True the "in" columns are differential, else absolute
    :param diff_out: if True the "out" columns are differential, else absolute
    :param drop_nan: if True drop samples containing NaN (from the shifts)
    :return: table with n_in * nD observation columns then n_out * nD
        prediction columns
    '''
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)
    pieces, labels = list(), list()

    # Observation window: t-(n_in-1) .. t.
    for back in range(n_in, 0, -1):
        labels += [('var_in%d(t-%d)' % (var + 1, back - 1)) for var in range(n_vars)]
        if diff_in:
            # Differential observation: change between consecutive steps.
            pieces.append(frame.shift(back - 1) - frame.shift(back))
        else:
            pieces.append(frame.shift(back - 1))

    # Prediction window: t+1 .. t+n_out.
    for ahead in range(1, n_out + 1):
        labels += [('var_out%d(t+%d)' % (var + 1, ahead)) for var in range(n_vars)]
        if diff_out:
            pieces.append(frame.shift(-ahead) - frame.shift(0))  # displacement
        else:
            pieces.append(frame.shift(-ahead))  # position

    table = concat(pieces, axis=1)
    table.columns = labels
    if drop_nan:
        table.dropna(inplace=True)
    return table.values
def ha(open_, high, low, close, offset=None, **kwargs):
    """Candle Type: Heikin Ashi

    BUG fix: the recursive HA_open update used chained indexing
    (``df["HA_open"][i] = ...``), which triggers SettingWithCopyWarning, is a
    silent no-op under pandas copy-on-write, and raises KeyError for
    non-integer (e.g. datetime) indexes. The recursion is now computed in a
    plain numpy array and assigned once.
    """
    # Validate Arguments
    open_ = verify_series(open_)
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    offset = get_offset(offset)

    # Calculate Result
    m = close.size
    # HA_close is the mean of the four raw prices.
    ha_close = 0.25 * (open_ + high + low + close)

    # HA_open is recursive: mean of the previous bar's HA_open and HA_close,
    # seeded with the mean of the first raw open/close.
    ha_open = np.empty(m)
    ha_open[0] = 0.5 * (open_.iloc[0] + close.iloc[0])
    ha_close_values = ha_close.to_numpy()
    for i in range(1, m):
        ha_open[i] = 0.5 * (ha_open[i - 1] + ha_close_values[i - 1])

    df = DataFrame({
        "HA_open": ha_open,
        "HA_high": high,
        "HA_low": low,
        "HA_close": ha_close,
    })
    # HA high/low must bracket the HA open/close.
    df["HA_high"] = df[["HA_open", "HA_high", "HA_close"]].max(axis=1)
    df["HA_low"] = df[["HA_open", "HA_low", "HA_close"]].min(axis=1)

    # Offset
    if offset != 0:
        df = df.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        df.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        df.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    df.name = "Heikin-Ashi"
    df.category = "candles"

    return df
def test_shift(self):
    """DataFrame.shift: naive shifts, freq/DateOffset shifts, PeriodIndex,
    and shifting along axis=1."""
    # naive shift: index unchanged, values move; matches Series.shift
    shiftedFrame = self.tsframe.shift(5)
    self.assert_index_equal(shiftedFrame.index, self.tsframe.index)

    shiftedSeries = self.tsframe['A'].shift(5)
    assert_series_equal(shiftedFrame['A'], shiftedSeries)

    # negative shift moves values the other way
    shiftedFrame = self.tsframe.shift(-5)
    self.assert_index_equal(shiftedFrame.index, self.tsframe.index)

    shiftedSeries = self.tsframe['A'].shift(-5)
    assert_series_equal(shiftedFrame['A'], shiftedSeries)

    # shift by 0 is a no-op
    unshifted = self.tsframe.shift(0)
    assert_frame_equal(unshifted, self.tsframe)

    # shift by DateOffset: moves the index, keeps the length
    shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
    self.assertEqual(len(shiftedFrame), len(self.tsframe))

    # freq string alias is equivalent to the offset object
    shiftedFrame2 = self.tsframe.shift(5, freq='B')
    assert_frame_equal(shiftedFrame, shiftedFrame2)

    # a row keeps its values; only its index label moves
    d = self.tsframe.index[0]
    shifted_d = d + offsets.BDay(5)
    assert_series_equal(self.tsframe.xs(d), shiftedFrame.xs(shifted_d),
                        check_names=False)

    # shift int frame
    int_shifted = self.intframe.shift(1)  # noqa

    # Shifting with PeriodIndex: index preserved, one valid value dropped
    ps = tm.makePeriodFrame()
    shifted = ps.shift(1)
    unshifted = shifted.shift(-1)
    self.assert_index_equal(shifted.index, ps.index)
    self.assert_index_equal(unshifted.index, ps.index)
    tm.assert_numpy_array_equal(unshifted.iloc[:, 0].valid().values,
                                ps.iloc[:-1, 0].values)

    # freq shift on PeriodIndex: string and offset forms agree and round-trip
    shifted2 = ps.shift(1, 'B')
    shifted3 = ps.shift(1, offsets.BDay())
    assert_frame_equal(shifted2, shifted3)
    assert_frame_equal(ps, shifted2.shift(-1, 'B'))

    # mismatched freq raises
    assertRaisesRegexp(ValueError, 'does not match PeriodIndex freq',
                       ps.shift, freq='D')

    # shift other axis
    # GH 6371
    df = DataFrame(np.random.rand(10, 5))
    expected = pd.concat([DataFrame(np.nan, index=df.index, columns=[0]),
                          df.iloc[:, 0:-1]],
                         ignore_index=True, axis=1)
    result = df.shift(1, axis=1)
    assert_frame_equal(result, expected)

    # shift named axis: axis='columns' behaves like axis=1
    df = DataFrame(np.random.rand(10, 5))
    expected = pd.concat([DataFrame(np.nan, index=df.index, columns=[0]),
                          df.iloc[:, 0:-1]],
                         ignore_index=True, axis=1)
    result = df.shift(1, axis='columns')
    assert_frame_equal(result, expected)
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()}) price.to_csv(filename) else: price = pd.read_csv(filename) price.index = [datetime.strptime(x,'%Y-%m-%d') for x in price['Date']] price = price.drop('Date',1) # Specify number of days to shift shift = 20 # Specify filter "length" filter_len = shift shift_returns = price/price.shift(shift) - 1 shift_returns_mean = pd.ewma(shift_returns,span=filter_len) shift_returns_var = pd.ewmvar(shift_returns,span=filter_len) CovSeq = pd.DataFrame() for FirstStock in np.arange(NumStocks-1): for SecondStock in np.arange(FirstStock+1,NumStocks): ColumnTitle = StockList[FirstStock] + '-' + StockList[SecondStock] CovSeq[ColumnTitle] = pd.ewmcov(shift_returns[StockList[FirstStock]],shift_returns[StockList[SecondStock]],span=filter_len) # Test CVXOPT code for a single day date = '2013-10-31' n = NumStocks+1 pbar = matrix(interest_rate,(1,n)) p2 = shift_returns_mean.ix[date] p2 = matrix(p2)
quotes = fin.quotes_historical_yahoo(symbol, start, end) dates, open, close, high, low, volume = zip(*quotes) data = {"open": open, "close": close, "high": high, "low": low, "volume": volume} dates = Index([datetime.fromordinal(int(d)) for d in dates]) return DataFrame(data, index=dates) msft = getQuotes("MSFT", startDate, endDate) aapl = getQuotes("AAPL", startDate, endDate) goog = getQuotes("GOOG", startDate, endDate) ibm = getQuotes("IBM", startDate, endDate) px = DataFrame({"MSFT": msft["close"], "IBM": ibm["close"], "GOOG": goog["close"], "AAPL": aapl["close"]}) returns = px / px.shift(1) - 1 # Select dates subIndex = ibm.index[(ibm["close"] > 95) & (ibm["close"] < 100)] msftOnSameDates = msft.reindex(subIndex) # Insert columns msft["hi-lo spread"] = msft["high"] - msft["low"] ibm["hi-lo spread"] = ibm["high"] - ibm["low"] # Aggregate monthly def toMonthly(frame, how): offset = BMonthEnd()
def test_shift_empty(self): # Regression test for #8019 df = DataFrame({'foo': []}) rs = df.shift(-1) assert_frame_equal(df, rs)
from pandas import DataFrame
from pandas import concat
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error

# read data from csv_file
series = read_csv('../../static/data_set.csv', nrows=2000, header=0,
                  parse_dates=[0], index_col=0, squeeze=True)

# Create lagged dataset: column 't-1' is the series shifted one step,
# 't+1' is the unshifted series — a one-step-ahead supervised pairing.
values = DataFrame(series.values)
dataframe = concat([values.shift(1), values], axis=1)
dataframe.columns = ['t-1', 't+1']
print(dataframe.head(5))

# split into train and test sets (first row skipped: its lag is NaN)
X = dataframe.values
train_size = int(len(X) * 0.66)
train, test = X[1:train_size], X[train_size:]
train_X, train_y = train[:, 0], train[:, 1]
test_X, test_y = test[:, 0], test[:, 1]


# persistence model: predict the previous value unchanged
def model_persistence(x):
    return x
def estimateBeta(priceY, priceX, algo='standard'):
    '''
    estimate stock Y vs stock X beta using iterative linear regression.
    Outliers outside 3 sigma boundary are filtered out

    Parameters
    --------
    priceX : price series of x (usually market)
    priceY : price series of y (estimate beta of this price)

    Returns
    --------
    beta : stockY beta relative to stock X
    '''
    frame = DataFrame({'x': priceX, 'y': priceY})

    if algo == 'returns':
        rets = (frame / frame.shift(1) - 1).dropna().values
        x = rets[:, 0]
        y = rets[:, 1]

        # Keep only the middle 60% of x-returns before fitting.
        lo = np.percentile(x, 20)
        hi = np.percentile(x, 80)
        keep = (x > lo) & (x < hi)
        x, y = x[keep], y[keep]

        # Iteratively refit, dropping residuals beyond 3 sigma.
        passes, remaining = 1, 1
        while passes < 10 and remaining > 0:
            slope, intercept = polyfit(x, y, 1)
            err = polyval([slope, intercept], x) - y
            bad = abs(err) > 3 * np.std(err)
            remaining = sum(bad)
            beta = slope
            x, y = x[~bad], y[~bad]
            passes += 1
    elif algo == 'log':
        # Slope of log(y) regressed on log(x).
        slope, _ = polyfit(np.log(frame['x']), np.log(frame['y']), 1)
        beta = slope
    elif algo == 'standard':
        # Covariance of log-returns over the market's variance.
        log_ret = np.log(frame).diff().dropna()
        beta = log_ret['x'].cov(log_ret['y']) / log_ret['x'].var()
    else:
        raise TypeError("unknown algorithm type, use 'standard', 'log' or 'returns'")

    return beta
def clean_pw_offday(pw_offday, weeklookup, pw_slp2):
    '''
    Clean pw_offday query without filtering out non-off-days
    invoice-level => day level => customer level

    Takes three DataFrames (the raw off-day delivery query, a date->week
    lookup, and a salesperson lookup), derives per-day and per-week delivery
    flags, then aggregates up to customer level and two summary levels.

    Returns a 5-tuple:
        (high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries)

    NOTE(review): mutates its arguments — `pw_offday` and `pw_slp2` are
    renamed in place and `weeklookup['Date']` is overwritten.
    NOTE(review): relies on names from the enclosing module: `dt` (presumably
    `datetime.datetime` — confirm import), `len_unique` (presumably a
    count-distinct helper — confirm), plus pd, np, Series, DataFrame, itertools.
    '''
    print('*'*100)
    print('Cleaning pw_offday query and creating summaries.')
    print('*'*100)
    deliveries = pw_offday

    print('\n\n\nDeclaring functions for later use.')

    def as400_date(dat):
        '''Accepts date as formatted in AS400'''
        # Last 6 digits of the AS400 stamp are yymmdd; parse to a date.
        dat = str(dat)
        dat = dat[-6:]
        dat = dt.date(dt.strptime(dat, '%y%m%d'))
        return dat

    def sum_digits_in_string(digit):
        # Sum of all digit characters; used to count '1' flags in the
        # 7-character ship-day bitmask (each flag is 0 or 1).
        return sum(int(x) for x in digit if x.isdigit())

    print('Mapping Columns.')
    # Rename AS400 column codes to readable names (in place, mutates caller).
    deliveries.rename(columns={'#MIVDT':'Date', '#MDIV#':'Division', '#MIVND':'Invoice', '#MCUS#':'CustomerId',
                               '#MCALL':'Call', '#MPRIO':'Priority', '#MCMP':'Warehouse', 'CASES':'Cases',
                               '#MEXT$':'Dollars', 'CSHP':'Ship', '#MSLSP':'SalespersonId', 'CADMBR':'ShipWeekPlan',
                               'CUDSCC':'Merchandising', 'CONPRM':'OnPremise', 'CSTDTE':'CustomerSetup',
                               '#MCUSY':'CustomerType', 'CCUSTN':'Customer'}, inplace=True)
    pw_slp2.rename(columns={'S2NUM#':'SalespersonId', 'S2NAME':'Salesperson', 'S2DIVR':'SalespersonDirector'}, inplace=True)
    deliveries = deliveries.merge(pw_slp2, on='SalespersonId', how='left')

    print('Mapping Customer types.')
    # One-letter/digit customer-type codes -> descriptive labels.
    typ_map = {'A':'Bar/Tavern','C':'Country Club','E':'Transportation/Airline','G':'Gambling',
               'J':'Hotel/Motel','L':'Restaurant','M':'Military','N':'Fine Dining','O':'Internal',
               'P':'Country/Western','S':'Package Store','T':'Supermarket/Grocery','V':'Drug Store',
               'Y':'Convenience Store','Z':'Catering','3':'Night Club','5':'Adult Entertainment','6':'Sports Bar',
               'I':'Church','F':'Membership Club','B':'Mass Merchandiser','H':'Fraternal Organization',
               '7':'Sports Venue'}
    deliveries.CustomerType = deliveries.CustomerType.astype(str).map(typ_map)

    print('Mapping Warehouse names.')
    whs_map = {1:'Kansas City',2:'Saint Louis',3:'Columbia',4:'Cape Girardeau', 5:'Springfield'}
    deliveries.Warehouse = deliveries.Warehouse.map(whs_map)

    print('Processing dates.')
    # AS400 stamps -> datetime.date; weeklookup dates are 'm/d/Y' strings.
    deliveries.Date = [as400_date(d) for d in deliveries.Date.astype(str).tolist()]
    weeklookup['Date'] = [dt.date(dt.strptime(w_Dat, '%m/%d/%Y')) for w_Dat in weeklookup['Date'].astype(str).tolist()]

    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')
    dat = Series(deliveries.Date.tolist())
    deliveries['Weekday'] = Series([dt.strftime(d, '%A') for d in dat])
    week_plan = deliveries.ShipWeekPlan.tolist()
    week_shipped = deliveries.ShipWeek.tolist()

    print('Using custom logic to derive which days were off-day deliveries.')
    # Normalize Ship to a zero-padded 7-char bitmask (Mon..Sun, '1' = allotted
    # delivery day). NOTE(review): .zfill(0) is a no-op — presumably intended
    # as padding, but the '%07d' format already supplies it.
    deliveries.Ship = del_days = [str('%07d'% int(str(day).zfill(0))) for day in deliveries.Ship.astype(str).tolist()]
    # Decode each bitmask position to a day letter ('_' when not allotted).
    mon = Series([d[-7:][:1] for d in del_days]).map({'1':'M','0':'_'})
    tue = Series([d[-6:][:1] for d in del_days]).map({'1':'T','0':'_'})
    wed = Series([d[-5:][:1] for d in del_days]).map({'1':'W','0':'_'})
    thu = Series([d[-4:][:1] for d in del_days]).map({'1':'R','0':'_'})
    fri = Series([d[-3:][:1] for d in del_days]).map({'1':'F','0':'_'})
    sat = Series([d[-2:][:1] for d in del_days]).map({'1':'S','0':'_'})
    sun = Series([d[-1:][:1] for d in del_days]).map({'1':'U','0':'_'})
    # Element-wise Series concatenation gives strings like 'M_W_F__'.
    deliveries['DeliveryDays'] = del_days = list(itertools.chain.from_iterable([mon + tue + wed + thu + fri + sat + sun]))
    weekday = deliveries.Weekday = [d[:3] for d in deliveries.Weekday.astype(str).tolist()]
    _days = DataFrame(data={'Weekday':weekday, 'WeekPlanned':week_plan, 'WeekShipped':week_shipped, 'DelDays':del_days})
    #'Monday':mon, 'Tuesday':tue, 'Wednesday':wed, 'Thursday':thu, 'Friday':fri, 'Saturday':sat, 'Sunday':sun,
    # Only 'A'/'B' are meaningful week plans; blank out everything else, then
    # fall back to the actually-shipped week where no plan exists.
    day_list = _days['WeekPlanned'].tolist()
    _days['WeekPlanned'] = [d if d in ['A','B'] else '' for d in day_list]
    _week_actual = _days.WeekShipped.tolist()
    _week_plan = _days['WeekPlanned'] = [ship_week if plan_week == '' else plan_week for ship_week, plan_week in zip(_week_actual,_days.WeekPlanned.tolist())]
    # Off-week = shipped in a different week than planned.
    _days['OffWeek'] = _off_week = [p != a for p, a in zip(_week_plan, _week_actual)]
    # str(True)[:1] == 'T', str(False)[:1] == 'F': 'T' marks a delivery made
    # on a weekday that is NOT in the customer's allotted-day string.
    off_mon = [str('M' not in d and w == 'Mon')[:1] for d, w in zip(del_days, weekday)]
    off_tue = [str('T' not in d and w == 'Tue')[:1] for d, w in zip(del_days, weekday)]
    off_wed = [str('W' not in d and w == 'Wed')[:1] for d, w in zip(del_days, weekday)]
    off_thu = [str('R' not in d and w == 'Thu')[:1] for d, w in zip(del_days, weekday)]
    off_fri = [str('F' not in d and w == 'Fri')[:1] for d, w in zip(del_days, weekday)]
    off_sat = [str('S' not in d and w == 'Sat')[:1] for d, w in zip(del_days, weekday)]
    off_sun = [str('U' not in d and w == 'Sun')[:1] for d, w in zip(del_days, weekday)]
    _off_days = DataFrame({'Mon':off_mon, 'Tue':off_tue, 'Wed':off_wed, 'Thu':off_thu, 'Fri':off_fri, 'Sat':off_sat, 'Sun':off_sun, 'OffWeek':_off_week, 'Weekday':weekday})
    _off_days = _off_days[['Mon','Tue','Wed','Thu','Fri','Sat','Sun','Weekday','OffWeek']]
    # Off-day delivery = delivered on any non-allotted weekday, or in an off week.
    _off_days['OffDayDelivery'] = (_off_days['Mon'] == 'T') | (_off_days['Tue'] == 'T') | (_off_days['Wed'] == 'T') | (_off_days['Thu'] == 'T') | (_off_days['Fri'] == 'T') | (_off_days['Sat'] == 'T') | (_off_days['Sun'] == 'T') | (_off_days['OffWeek'] == True)

    print('Check here if you suspect a bug.')
    #check_later = _off_days[_off_days['OffDayDelivery'] == True]

    print('Mapping Call Codes.')
    # Positional concat: assumes deliveries and _off_days share a 0..n-1 index.
    deliveries = pd.concat([deliveries,_off_days[['OffWeek','OffDayDelivery']]], axis=1)
    deliveries.Call = deliveries.Call.map({1:'Customer Call', 2:'ROE/EDI', 3:'Salesperson Call', 4:'Telesales'})

    print('Putting Setup Date into proper date format.')
    # Setup stamp is m(m)yy; zero-pad to 4 chars, split month and 2-digit year.
    # Years < 20 are treated as 20xx, otherwise 19xx.
    setup_date = deliveries.CustomerSetup.astype(str).tolist()
    setup_month = Series([d.zfill(4)[:2] for d in setup_date])
    setup_year = Series(["20" + s[-2:] if int(s[-2:]) < 20 else "19" + s[-2:] for s in setup_date])
    #this_century = [int(d[-2:]) < 20 for d in setup_date]
    deliveries['CustomerSetup'] = c_setup = [str(mon) + '-' + str(yr) for mon, yr in zip(setup_month, setup_year)]

    print('Defining new customers based on whether they were setup last month or not.')
    # Build the 'MM-YYYY' label of last month (wrapping December/year-end).
    if dt.now().month == 1:
        last_month = '12'
    else:
        last_month = str(dt.now().month - 1).zfill(2)
    if dt.now().month == 1:
        this_year = str(dt.now().year - 1)
    else:
        this_year = str(dt.now().year)
    m_y_cutoff = last_month + '-' + this_year
    deliveries['NewCustomer'] = [1 if m_y_cutoff == setup else 0 for setup in c_setup]
    deliveries['OffDayDeliveries'] = deliveries.OffDayDelivery.astype(int)

    print('Deriving number of weekly deliveries allotted to each customer.')
    # Count of '1' flags in the ship bitmask; customers on an A/B alternating
    # week plan count as 0.5 days per week.
    _n_days = deliveries.Ship.astype(str).tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [sum_digits_in_string(n) for n in _n_days]
    _allot = deliveries['AllottedWeeklyDeliveryDays'].tolist()
    _week_ind = deliveries['ShipWeekPlan'].tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [a if w not in ['A','B'] else 0.5 for a, w in zip(_allot, _week_ind)]
    # CustomerId -> allotted days lookup (reused after aggregation below).
    _n_days = deliveries.set_index('CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()

    print('\n')
    print('-'*100)
    print('\n')

    print('Aggregating by Day.')
    # NOTE(review): dict-of-dict agg specs and the '<lambda>' column names they
    # produce were deprecated in pandas 0.20 and removed in 1.0 — this function
    # presumably targets an older pandas; confirm before upgrading.
    agg_funcs_day = {'OffDayDeliveries' : {'Count':max},
                     'Date' : {'Count':len_unique},
                     'Cases' : {'Sum':sum, 'Avg':np.mean},
                     'Dollars' : {'Sum':sum, 'Avg':np.mean},
                     'NewCustomer': lambda x: min(x)}
    pass_through_cols = ['CustomerId','Customer','Week','Date']
    _agg_byday = DataFrame(deliveries.groupby(pass_through_cols).agg(agg_funcs_day)).reset_index(drop=False)
    _agg_byday = DataFrame(_agg_byday[['CustomerId','Customer','Week','Date','OffDayDeliveries','NewCustomer','Cases','Dollars']])
    # Flatten the MultiIndex columns to 'name|stat', then rename positionally.
    _agg_byday.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byday.columns]
    _agg_byday.columns = ['CustomerId','Customer','Week','Date','Delivery','OffDayDelivery','NewCustomer','Cases|Sum','Cases|Avg','Dollars|Sum','Dollars|Avg']
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['CustomerId'].astype(int)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['AllottedWeeklyDeliveryDays|Count'].map(_n_days)

    print('Aggregating by Week.')
    agg_funcs_week = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x)}
    _agg_byweek = DataFrame(_agg_byday.groupby(['CustomerId','Week']).agg(agg_funcs_week)).reset_index(drop=False)
    _agg_byweek.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byweek.columns]

    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery. Use a dictionary {(cust#, week) : n_deliveries_total}
    _c = _agg_byweek['CustomerId'].astype(str).tolist()
    _w = _agg_byweek['Week'].astype(str).tolist()
    # Composite 'customerid,week' string key for the weekly-count lookup.
    _agg_byweek['_X'] = [c + ',' + w for c,w in zip(_c,_w)]
    by_week_map = _agg_byweek.set_index('_X')['Delivery|Count'].to_dict()
    cid = _agg_byday['CustomerId'].astype(str).tolist()
    wkk = _agg_byday['Week'].astype(str).tolist()
    _agg_byday['N_DeliveriesThisWeek'] = [c + ',' + w for c, w in zip(cid, wkk)]
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday['N_DeliveriesThisWeek'].map(Series(by_week_map))

    print('Using custom logic to define Additional Delivery Days.')
    # Additional delivery day: same customer and week as the previous row
    # (rows are grouped by customer/week), an off-day delivery, not a new
    # customer, and more deliveries this week than allotted.
    addl_day_criteria_1 = ( _agg_byday.shift(1)['CustomerId'] == _agg_byday['CustomerId'] )
    addl_day_criteria_2 = ( _agg_byday.shift(1)['Week'] == _agg_byday['Week'] )
    addl_day_criteria_3 = ( _agg_byday['OffDayDelivery'] == 1 )
    addl_day_criteria_4 = ( _agg_byday['NewCustomer'] != 1 )
    addl_day_criteria_5 = ( _agg_byday['N_DeliveriesThisWeek'] > _agg_byday['AllottedWeeklyDeliveryDays|Count'] )
    _agg_byday['AdditionalDeliveryDays'] = Series(addl_day_criteria_1 & addl_day_criteria_2 & addl_day_criteria_3 & addl_day_criteria_4 & addl_day_criteria_5).astype(int)

    print('Aggregating by Customer.')
    agg_funcs_cust = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x),
                      'AllottedWeeklyDeliveryDays|Count': lambda x: max(x),
                      'AdditionalDeliveryDays': lambda x: sum(x),
                      'Dollars|Sum':lambda x: int(sum(x)),
                      'Cases|Sum':lambda x: sum(x) }
    _agg_bycust = DataFrame(_agg_byday.groupby(['CustomerId','Customer']).agg(agg_funcs_cust)).reset_index(drop=False)
    _agg_bycust.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_bycust.columns]
    # Sort columns alphabetically so the positional rename below lines up.
    # NOTE(review): reindex_axis was removed in pandas 1.0 (use reindex) —
    # further evidence this targets an older pandas.
    _agg_bycust = _agg_bycust.reindex_axis(sorted(_agg_bycust.columns), axis=1)
    _agg_bycust.columns = ['AdditionalDeliveries','AllottedDeliveryDays','Cases',
                           'Customer','CustomerId','Deliveries','Dollars',
                           'NewCustomer','OffDayDeliveries']
    _agg_bycust = _agg_bycust[['CustomerId','Customer','NewCustomer','AllottedDeliveryDays','Deliveries',
                               'OffDayDeliveries','AdditionalDeliveries','Cases','Dollars']]

    print('Mapping useful Customer attributes.')
    attr = ['CustomerId','Warehouse','OnPremise','CustomerSetup','CustomerType','ShipWeekPlan','DeliveryDays']
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(drop=True)
    _agg_bycust = _agg_bycust.merge(customer_attributes, on='CustomerId', how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(by=['AdditionalDeliveries','OffDayDeliveries'], ascending=False).reset_index(drop=True)
    # Per-delivery ratios for the customer-level view.
    _agg_bycust['CasesPerDelivery'] = _agg_bycust['Cases'] / _agg_bycust['Deliveries']
    _agg_bycust['DollarsPerDelivery'] = round(_agg_bycust['Dollars'] / _agg_bycust['Deliveries'],2)
    _agg_bycust['OffDayDeliveries/Deliveries'] = round(_agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'],2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(_agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'],2)

    print('Mapping Tiers based on allotted delivery days.')
    # 3+ allotted days -> Tier 1; alternating-week (0.5) -> Tier 4.
    tier_map = {0:'No Delivery Days Assigned',0.5:'Tier 4', 1:'Tier 3', 2:'Tier 2', 3:'Tier 1', 4:'Tier 1', 5:'Tier 1', 6:'Tier 1', 7:'Tier 1'}
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)
    # Customers with no assigned days cannot have "additional" deliveries.
    addl_deliv = _agg_bycust['AdditionalDeliveries'].tolist()
    tier = _agg_bycust['Tier'].tolist()
    _agg_bycust['AdditionalDeliveries'] = [addl if t != 'No Delivery Days Assigned' else 0 for addl, t in zip(addl_deliv, tier)]
    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(np.nan, '')

    print('Creating Overall Summary.')
    agg_funcs_summary = {'Deliveries':sum, 'OffDayDeliveries':sum, 'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean}, 'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum, 'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}
    overall_summary = DataFrame(_agg_bycust.groupby(['Tier','Warehouse']).agg(agg_funcs_summary))
    overall_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in overall_summary.columns]
    overall_summary = overall_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    overall_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                               'Cases|mean','CasesPerDelivery|mean','Dollars|mean']

    print('Creating High-Level Summary.\n\n\n')
    # Same aggregation as above but grouped by Tier only.
    agg_funcs_HL_summary = {'Deliveries':sum, 'OffDayDeliveries':sum, 'AdditionalDeliveries':sum,
                            'Dollars':{'Avg':np.mean}, 'Cases':{'Avg':np.mean},
                            'CasesPerDelivery':{'Avg':np.mean},
                            'NewCustomer':sum, 'Customer':len,
                            'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}
    high_level_summary = DataFrame(_agg_bycust.groupby(['Tier']).agg(agg_funcs_HL_summary))
    high_level_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in high_level_summary.columns]
    high_level_summary = high_level_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                             'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                             'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    high_level_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                  'Cases|mean','CasesPerDelivery|mean','Dollars|mean']

    print('*'*100)
    print('Finished creating summaries at high level, overall, and aggregating by customer and by day.')
    print('*'*100)
    return high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries