Example #1
class Shift(object):
    # frame shift speedup issue-5609
    params = [0, 1]
    param_names = ['axis']

    def setup(self, axis):
        self.df = DataFrame(np.random.rand(10000, 500))

    def time_shift(self, axis):
        self.df.shift(1, axis=axis)
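This is an asv-style benchmark class; the minimal sketch below (an illustration, not part of the benchmark suite) shows the two parametrised cases it times: shifting a frame along rows versus along columns.

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.rand(5, 3))
print(df.shift(1, axis=0))   # axis=0: the first row becomes NaN
print(df.shift(1, axis=1))   # axis=1: the first column becomes NaN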
Example #2
    def test_shift_fill_value(self):
        # GH #24128
        df = DataFrame([1, 2, 3, 4, 5],
                       index=date_range('1/1/2000', periods=5, freq='H'))
        exp = DataFrame([0, 1, 2, 3, 4],
                        index=date_range('1/1/2000', periods=5, freq='H'))
        result = df.shift(1, fill_value=0)
        assert_frame_equal(result, exp)

        exp = DataFrame([0, 0, 1, 2, 3],
                        index=date_range('1/1/2000', periods=5, freq='H'))
        result = df.shift(2, fill_value=0)
        assert_frame_equal(result, exp)
Example #3
def timeseries_to_supervised(data, lag=1):
    df = DataFrame(data)
    columns = [df.shift(i) for i in range(1, lag+1)]
    columns.append(df)
    df = concat(columns, axis=1)
    df.fillna(0, inplace=True)
    return df
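A small usage sketch (assuming `from pandas import DataFrame, concat` as the snippet implies): a lag-1 call pairs each value with its predecessor, and the leading NaN is filled with 0.

values = [10, 20, 30, 40]
supervised = timeseries_to_supervised(values, lag=1)
print(supervised)
#       0   0
# 0   0.0  10
# 1  10.0  20
# 2  20.0  30
# 3  30.0  40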
Example #4
def main():
    all_data = {}
    for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
        all_data[ticker] = web.get_data_yahoo(ticker, '1/3/2000', '12/31/2009')

    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})
    returns = (price - price.shift(1)) / price
Example #5
 def test_shift_categorical(self):
     # GH 9416
     s1 = pd.Series(["a", "b", "c"], dtype="category")
     s2 = pd.Series(["A", "B", "C"], dtype="category")
     df = DataFrame({"one": s1, "two": s2})
     rs = df.shift(1)
     xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)})
     assert_frame_equal(rs, xp)
Example #6
 def test_shift_bool(self):
     df = DataFrame({'high': [True, False],
                     'low': [False, False]})
     rs = df.shift(1)
     xp = DataFrame(np.array([[np.nan, np.nan],
                              [True, False]], dtype=object),
                    columns=['high', 'low'])
     assert_frame_equal(rs, xp)
Example #7
 def test_shift_categorical(self):
     # GH 9416
     s1 = pd.Series(['a', 'b', 'c'], dtype='category')
     s2 = pd.Series(['A', 'B', 'C'], dtype='category')
     df = DataFrame({'one': s1, 'two': s2})
     rs = df.shift(1)
     xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
     assert_frame_equal(rs, xp)
Example #8
def estimateBeta(priceY, priceX, algo="standard"):
    """
    estimate stock Y vs stock X beta using iterative linear
    regression. Outliers outside 3 sigma boundary are filtered out
    
    Parameters
    --------
    priceX : price series of x (usually market)
    priceY : price series of y (estimate beta of this price)
    
    Returns
    --------
    beta : stockY beta relative to stock X
    """

    X = DataFrame({"x": priceX, "y": priceY})

    if algo == "returns":
        ret = (X / X.shift(1) - 1).dropna().values

        # print len(ret)

        x = ret[:, 0]
        y = ret[:, 1]

        iteration = 1
        nrOutliers = 1
        while iteration < 10 and nrOutliers > 0:
            (a, b) = polyfit(x, y, 1)
            yf = polyval([a, b], x)
            # plot(x,y,'x',x,yf,'r-')
            err = yf - y
            idxOutlier = abs(err) > 3 * np.std(err)
            nrOutliers = sum(idxOutlier)
            beta = a
            # print 'Iteration: %i beta: %.2f outliers: %i' % (iteration,beta, nrOutliers)
            x = x[~idxOutlier]
            y = y[~idxOutlier]
            iteration += 1

    elif algo == "log":
        x = np.log(X["x"])
        y = np.log(X["y"])
        (a, b) = polyfit(x, y, 1)
        beta = a

    elif algo == "standard":
        ret = np.log(X).diff().dropna()
        beta = ret["x"].cov(ret["y"]) / ret["x"].var()

    else:
        raise TypeError("unknown algorithm type, use 'standard', 'log' or 'returns'")

    return beta
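A hedged usage sketch with synthetic prices (the other branches assume numpy's polyfit/polyval are in scope, e.g. `from numpy import polyfit, polyval`): the log-return regression of the default branch should recover a beta close to the one used to build the series.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
mkt_ret = rng.normal(0, 0.01, 500)
stk_ret = 1.5 * mkt_ret + rng.normal(0, 0.005, 500)
priceX = pd.Series(100 * np.exp(np.cumsum(mkt_ret)))   # market
priceY = pd.Series(100 * np.exp(np.cumsum(stk_ret)))   # stock
print(estimateBeta(priceY, priceX, algo="standard"))   # roughly 1.5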
Example #9
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
        n_vars = 1 if type(data) is list else data.shape[1]
        df = DataFrame(data)
        cols, names = list(), list()
        # input sequence (t-n, ... t-1)
        for i in range(n_in, 0, -1):
                cols.append(df.shift(i))
                names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
        # forecast sequence (t, t+1, ... t+n)
        for i in range(0, n_out):
                cols.append(df.shift(-i))
                if i == 0:
                        names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
                else:
                        names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
        # put it all together
        agg = concat(cols, axis=1)
        agg.columns = names
        # drop rows with NaN values
        if dropnan:
                agg.dropna(inplace=True)
        return agg
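A usage sketch (assuming `from pandas import DataFrame, concat` as in the snippet): two variables framed with one lagged input step and the current step as output.

import numpy as np

raw = np.arange(10).reshape(5, 2)                   # two columns: var1, var2
framed = series_to_supervised(raw, n_in=1, n_out=1)
print(framed.columns.tolist())
# ['var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)']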
Example #10
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	targets = df.shift(0)
	drop_list = [i for i in range(n_vars -1)]
	targets.drop(targets.columns[drop_list], axis=1, inplace=True)
	cols.append(targets)
	names += ['var%d' % (n_out -1)]
	#print(names)
	#print(cols)
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg
Example #11
def create_dataset_for_supervised_learning(dataset, num_shift=1):
    dataset_for_sl = DataFrame(dataset)
    list_precip_hly_shifted = [
    ]  # list of DataFrames holding the shifted time series
    #for num_col in range(1, dataset.shape[1] + 1):
    for ns in range(1, num_shift + 1):
        precip_hly_shifted = dataset_for_sl.shift(
            ns)  # shift the time series forward by ns steps
        #print precip_hly_shifted
        list_precip_hly_shifted.insert(0, precip_hly_shifted)
    list_precip_hly_shifted.append(
        dataset_for_sl)  # also append the original sequence
    dataset_for_sl = concat(
        list_precip_hly_shifted, axis=1
    )  # concatenate the sequences, keeping a single index for the dataframe
    return dataset_for_sl
Example #12
def generate_data(n_timesteps):
	# generate sequence
	sequence = generate_sequence(n_timesteps)
	sequence = array(sequence)
	# create lag
	df = DataFrame(sequence)
	df = concat([df.shift(1), df], axis=1)
	# replace NaN values with -1
	df.fillna(-1, inplace=True)
	values = df.values
	# specify input and output data
	X, y = values, values[:, 1]
	# reshape
	X = X.reshape(len(X), 2, 1)
	y = y.reshape(len(y), 1)
	return X, y
Example #13
def timeseries_to_supervised(data, lag=1):
    df = DataFrame(data)
    #print(df)
    # This line builds a list whose first element is df shifted down by one step
    # The list-comprehension form is used so the result is a list of DataFrames rather than a single frame
    # columns = df.shift(1) would produce a single shifted frame, not a list
    columns = [df.shift(i) for i in range(1, lag + 1)]
    #print(columns)

    # df is appended as the second element
    columns.append(df)
    # Concatenate df.shift(1) and df column-wise to form the supervised dataset
    df = concat(columns, axis=1)
    #print(df)
    # Fill the NaN positions with zeros
    df.fillna(0, inplace=True)
    return df
Example #14
def to_supervised(sequence, n_in, n_out):
    # create lagged copies of the sequence
    df = DataFrame(sequence)
    #
    df = concat([df.shift(n_in - i - 1) for i in range(n_in)], axis=1)
    print(df)
    # drop rows with missing data
    df.dropna(inplace=True)
    # specify columns for input and output pairs
    values = df.values
    print(values)
    width = sequence.shape[1]
    X = values.reshape(len(values), n_in, width)
    y = values[:, 0:(n_out * width)].reshape(len(values), n_out, width)
    print(X)
    print(y)
    return X, y
Example #15
def prepare_timestep_data(data, lookback=1, dropnan=True):
    number_of_var = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # X sequence (t-n, ... t-1)
    for i in range(lookback, 0, -1):
        cols.append(df.shift(i))
        names += [('X%d(t-%d)' % (j + 1, i)) for j in range(number_of_var)]

    # put it all together
    aggregated_data = concat(cols, axis=1)
    aggregated_data.columns = names
    # drop rows with NaN values
    if dropnan:
        aggregated_data.dropna(inplace=True)

    return aggregated_data
Example #16
    def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
        """Create a shifted version of ``time_series``.

        Parameters
        ----------
        time_series : pd.DataFrame, shape (n_samples, 1), required
            The DataFrame to shift.

        Returns
        -------
        time_series_t : pd.DataFrame, shape (n_samples, 1)
            The shifted version of the original ``time_series``.

        """
        time_series_shifted = time_series.shift(self.shift)
        time_series_t = self._rename_columns(time_series_shifted)
        return time_series_t
Example #17
    def lag_var(data, n_in=1, n_out=1, dropnan=True):
        n_vars = 1 if type(data) is list else data.shape[1]
        columns_df = data.columns
        df = DataFrame(data)
        cols, names = list(), list()

        for i in (range(n_in, 0, -1)):
            cols.append(df.shift(i))
            names += [(k + '_var(t-%d)' % (i))
                      for j, k in zip(range(n_vars), columns_df)]

        agg = concat(cols, axis=1)
        agg.columns = names

        if dropnan:
            agg.dropna(inplace=True)
        return agg
Example #18
def clean_consecutive_duplicates(
        move_data: DataFrame,
        subset: Optional[Union[int, Text]] = None,
        keep: Optional[Union[Text, bool]] = 'first',
        inplace: Optional[bool] = False) -> Optional[DataFrame]:
    """
    Removes consecutive duplicate rows of the DataFrame; optionally only
    certain columns can be considered.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data
    subset : Array of strs, optional
        Column label or sequence of labels considered for
        identifying duplicates, by default None
    keep : 'first', 'last', optional
        If keep is set as first, all the duplicates except for
        the first occurrence will be dropped.
        On the other hand if set to last, all duplicates except for
        the last occurrence will be dropped.
        If set to False, all duplicates are dropped.
        by default 'first'
    inplace : boolean, optional
        if set to true the original dataframe will be altered,
        the duplicates will be dropped in place,
        otherwise a copy will be returned, by default False

    Returns
    -------
    DataFrame
        The filtered trajectories points without consecutive duplicates or None

    """

    if keep == 'first':
        n = 1
    else:
        n = -1
    if subset is None:
        filter_ = (move_data.shift(n) != move_data).any(axis=1)
    else:
        filter_ = (move_data[subset].shift(n) != move_data[subset]).any(axis=1)

    return move_data.drop(index=move_data[~filter_].index, inplace=inplace)
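A usage sketch (assuming pandas and the typing imports used in the signature are in scope): only the first row of each run of identical consecutive rows is kept.

import pandas as pd

df = pd.DataFrame({'lat': [1, 1, 1, 2, 2], 'lon': [5, 5, 6, 6, 6]})
print(clean_consecutive_duplicates(df))
#    lat  lon
# 0    1    5
# 2    1    6
# 3    2    6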
Example #19
def make_dataset(data, n_input=1, out_index=0, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    # input (t-n, ... t-1)
    for i in range(n_input, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # output (t)
    cols.append(df[df.columns[out_index]])
    names += ['result']
    # combine the inputs
    result = concat(cols, axis=1)
    result.columns = names
    # del miss values cols
    if dropnan:
        result.dropna(inplace=True)
    return result
Example #20
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
Example #21
def _generate_lags(df: pd.DataFrame, lags: Union[int,
                                                 List[int]]) -> pd.DataFrame:
    df = df.copy()
    # Constructs variables with lagged values.
    if isinstance(lags, int):
        lags = range(lags)
    cols = df.columns

    collection = list()

    for L in lags:
        df_lagged = df.shift(L)
        df_lagged.columns = [x + f"_L{L}" for x in cols]
        collection.append(df_lagged)
    merged = pd.concat(collection, axis=1)
    cols = merged.columns
    merged = merged[sorted(merged.columns)]
    return merged
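A usage sketch: when `lags` is an int it is expanded to `range(lags)`, so lag 0 (the unshifted column) is included and each lag gets an `_L<lag>` suffix.

import pandas as pd

df = pd.DataFrame({"y": [1.0, 2.0, 3.0, 4.0]})
lagged = _generate_lags(df, lags=3)
print(lagged.columns.tolist())   # ['y_L0', 'y_L1', 'y_L2']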
Example #22
def convert_dataset(data, n_input, out_index=0, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    # input sequence (t-n,...,t-1)
    for i in range(n_input, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # output
    cols.append(df[df.columns[out_index]])
    names += ['results']
    # concat input/output column
    result = concat(cols, axis=1)
    result.columns = names
    # delete nan
    if dropnan:
        result.dropna(inplace=True)
    return result
Example #23
def to_supervised(sequence, n_in, n_out):
	# create lag copies of the sequence
	df = DataFrame(sequence)
	#pd.set_option('display.max_columns', 30)
	print('df=', df.values.shape)
	df = concat([df.shift(n_in-i-1) for i in range(n_in)], axis=1)  # lag copies aligned as t-(n_in-1), ..., t-1, t
	print(df.head(20))
	# drop rows with missing values
	df.dropna(inplace=True)
	# specify columns for input and output pairs
	values = df.values
	print('len values=', len(values))
	print('values=', values.shape)
	width = sequence.shape[1] #  (25,100)  => 100
	X = values.reshape(len(values), n_in, width)  # 21 , 5 , 100 
	y = values[:, 0:(n_out*width)].reshape(len(values), n_out, width) # 21 , 3 , 100 
	print('X=', X, '\ny=', y)
	return X, y
Example #24
def convert_dataset(data, n_input=1, out_index=0, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_input, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # output (t)
    cols.append(df[df.columns[out_index]])
    names += ['result']
    # combine the input and output sequences
    result = concat(cols, axis=1)
    result.columns = names
    # drop rows containing missing values
    if dropnan:
        result.dropna(inplace=True)
    return result
Example #25
 def find_growth_index(x:pd.DataFrame,nlist=nlist):
     x=x.copy()
     x = x.sort_values(by='日期', ascending=False)
     x=x[0:2*max(nlist)].copy()
     x.reset_index(inplace=True)
     res=pd.DataFrame()
     for n in nlist:
         str1=str(n)+'日涨跌幅'
         res.loc[0,str1]=((x.loc[0,'收盘价'])-(x.loc[n,'收盘价']))/x.loc[n,'收盘价']
         xc = x.shift(n)
         str2 = str(n) + '日最大涨幅'+str(2*max(nlist))+'日内'
         str3 = str(n) + '日最小涨幅'+str(2*max(nlist))+'日内'
         up_down=((xc['收盘价']-x['收盘价'])/x['收盘价'])[n:]
         print(up_down)
         print(max((up_down)),min(up_down))
         res.loc[0, [str2,str3]]=[max((up_down)),min(up_down)]
         # res.loc[0,str2],res.loc[0,str3]= \
         #     max((up_down)), min(up_down)
     return res
Example #26
def fetch_expected_weights(assets: list,
                           d: pd.DataFrame,
                           num_portfolios: int = 10000,
                           rfr=0.05,
                           no_of_days=365):

    returns = np.log(d / d.shift(1))
    mean_returns = returns.mean()
    cov_matrix = returns.cov()
    num_assets = len(assets)
    port_returns = []
    port_volatility = []
    sharpe_ratio = []
    stock_weights = []

    for i in range(num_portfolios):
        weights = np.random.random_sample(num_assets)
        weights /= np.sum(weights)
        volatility, ret = portfolio_annualised_performance(
            weights, mean_returns, cov_matrix, no_of_days)
        stock_weights.append(weights)
        port_returns.append(ret)
        port_volatility.append(volatility)
        sharpe = (ret - rfr) / volatility
        sharpe_ratio.append(sharpe)

    portfolio = {
        'Returns': port_returns,
        'Volatility': port_volatility,
        'Sharpe Ratio': sharpe_ratio
    }
    for counter, symbol in enumerate(assets):
        portfolio[symbol] = [Weight[counter] for Weight in stock_weights]
    df = pd.DataFrame(portfolio)
    column_order = ['Returns', 'Volatility', 'Sharpe Ratio']
    column_order = column_order + [stock for stock in assets]
    df = df[column_order]
    max_sharpe = df['Sharpe Ratio'].max()
    min_volatility = df['Volatility'].min()
    max_sharpe_port = df.loc[df['Sharpe Ratio'] == max_sharpe]
    min_volatility_port = df.loc[df['Volatility'] == min_volatility]
    return get_dict_result(min_volatility_port),\
        get_dict_result(max_sharpe_port)
Example #27
 def generate_factor(self):
     CLOSE = DataFrame({stock: pd.read_csv('%s/StockDailyData/Stock/%s.csv'%(gc.DATABASE_PATH, stock), index_col=[0], parse_dates=[0]).loc[:, 'close'] for stock in self.stocks})
     
     hk = pd.read_csv('%s/StockMoneyData/HK.csv'%gc.DATABASE_PATH, index_col=[0], parse_dates=[0])
     hk.fillna(method='ffill', inplace=True)
     hk.fillna(0, inplace=True)
     cols = list(filter(lambda x:x[0]=='3', hk.columns))
     hk = DataFrame(hk.loc[:, cols], index=CLOSE.index, columns=cols)
     
     CLOSE = CLOSE.loc[CLOSE.index >= self.start_date, :]
     CLOSE = CLOSE.loc[CLOSE.index <= self.end_date, :]
     
     hk = hk.shift().loc[CLOSE.index, :]
     hk_hold = DataFrame(0, index=CLOSE.index, columns=self.stocks)
     hk_hold.loc[hk.index, hk.columns] = hk
     a = hk_hold * CLOSE
     a = a.loc[a.index >= self.start_date, :]
     a = a.loc[a.index <= self.end_date, :]
     self.factor = a
Example #28
def convert_dataset(data, n_input=1, out_index=0, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    #input sequences (t-n,...,t-1)
    for i in range(n_input, 0, -1):
        cols.append(df.shift(i))  # shift pushes the data later along the time axis
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    #print(cols)
    #output sequence t
    cols.append(df[df.columns[out_index]])
    names += ['result']
    #concatenate input and output
    result = concat(cols, axis=1)
    result.columns = names
    #delete rows having nan value
    if dropnan:
        result.dropna(inplace=True)
    return result
Example #29
def _lag_df(df: pd.DataFrame, lags: Union[int, list]) -> pd.DataFrame:
    """
    Advances in Financial Machine Learning, Snipet 17.3, page 259.
    Apply Lags to DataFrame
    :param df: (int or list) Either number of lags to use or array of specified lags
    :param lags: (int or list) Lag(s) to use
    :return: (pd.DataFrame) Dataframe with lags
    """
    df_lagged = pd.DataFrame()
    if isinstance(lags, int):
        lags = range(1, lags + 1)
    else:
        lags = [int(lag) for lag in lags]

    for lag in lags:
        temp_df = df.shift(lag).copy(deep=True)
        temp_df.columns = [str(i) + '_' + str(lag) for i in temp_df.columns]
        df_lagged = df_lagged.join(temp_df, how='outer')
    return df_lagged
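A usage sketch: an integer `lags` becomes lags 1..lags, and each lagged column is suffixed with `_<lag>`.

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
print(_lag_df(df, lags=2).columns.tolist())   # ['x_1', 'x_2']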
Example #30
    def test_datetime_frame_shift_with_freq(self, datetime_frame):
        shifted = datetime_frame.shift(1, freq="infer")
        unshifted = shifted.shift(-1, freq="infer")
        tm.assert_frame_equal(datetime_frame, unshifted)

        shifted2 = datetime_frame.shift(freq=datetime_frame.index.freq)
        tm.assert_frame_equal(shifted, shifted2)

        inferred_ts = DataFrame(
            datetime_frame.values,
            Index(np.asarray(datetime_frame.index)),
            columns=datetime_frame.columns,
        )
        shifted = inferred_ts.shift(1, freq="infer")
        expected = datetime_frame.shift(1, freq="infer")
        expected.index = expected.index._with_freq(None)
        tm.assert_frame_equal(shifted, expected)

        unshifted = shifted.shift(-1, freq="infer")
        tm.assert_frame_equal(unshifted, inferred_ts)
Example #31
    def test_shift_dt64values_int_fill_deprecated(self):
        # GH#31971
        ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])
        df = ser.to_frame()

        with tm.assert_produces_warning(FutureWarning):
            result = df.shift(1, fill_value=0)

        expected = Series([pd.Timestamp(0), ser[0]]).to_frame()
        tm.assert_frame_equal(result, expected)

        # axis = 1
        df2 = DataFrame({"A": ser, "B": ser})
        df2._consolidate_inplace()

        with tm.assert_produces_warning(FutureWarning):
            result = df2.shift(1, axis=1, fill_value=0)

        expected = DataFrame({"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]})
        tm.assert_frame_equal(result, expected)
Example #32
def to_supervised_Act_Per(_actList, _perList, n_in):
    dfAct = DataFrame(_actList)

    dfX = DataFrame()
    for i in range(0, n_in):
        currentVar = dfAct.shift(-i)
        dfX = concat([dfX, currentVar], axis=1)
    dfX.dropna(inplace=True)

    for _ in range(n_in - 1):
        _perList = np.delete(_perList, 0, 0)

    dfy = DataFrame(_perList)

    valuesX, valuesy = dfX.values, dfy.values

    X = valuesX.reshape(len(valuesX), n_in, -1)
    y = valuesy.reshape(len(valuesy), 1, -1)

    return X, y
Example #33
def dataframe_to_supervised(data: pd.DataFrame,
                            sequence_length: int = 1) -> list:
    """
    Does the same as timeseries_to_supervised but with whole DataFrame (all columns)

    :param data: any dataframe
    :param sequence_length: how many dataframes will be in the list
    :return: a list of dataframes with reset index
    """
    assert sequence_length >= 1
    sequence_length = sequence_length - 1

    shiftFrames = [data.shift(i) for i in range(sequence_length, 0, -1)]
    shiftFrames.append(data)
    for i in range(len(shiftFrames)):
        frame = shiftFrames[i]
        frame.drop(frame.index[:sequence_length], inplace=True)
        frame.reset_index(drop=True, inplace=True)
        # don't need to drop from the bottom (DataFrame#shift doesn't change DataFrame size)
    return shiftFrames
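A usage sketch: `sequence_length=3` returns three aligned frames (t-2, t-1, t), each trimmed of its first two rows and re-indexed from 0; note that the last element is the caller's own frame, which is trimmed in place as a side effect of appending it directly.

import pandas as pd

data = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
frames = dataframe_to_supervised(data, sequence_length=3)
print(len(frames))                # 3
print(frames[0]["a"].tolist())    # [1.0, 2.0, 3.0]  (the t-2 frame)
print(frames[-1]["a"].tolist())   # [3, 4, 5]        (the original frame, trimmed)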
Example #34
def get_quality_adjustments(
    quality_value: pd.DataFrame,
    to_reset: Optional[pd.DataFrame] = None,
    to_adjust: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Return cumulative quality adjustment factors for given values.

    Accumulates the quality adjustments across each Feb-Jan+1 window,
    resetting back to no adjustment (a factor of 1) if a reset occurs.
    By default, adjustment factors are determined by dividing quality
    values by the value in the period before, but this can be subset
    using `to_adjust`.

    Parameters
    ----------
    quality_value : DataFrame
        The quality value used to calculate quality adjustments.
    to_reset : DataFrame
        Boolean mask of quality adjustments to be reset.
    to_adjust : DataFrame
        Boolean mask of values to be adjusted.

    Returns
    -------
    DataFrame
        Cumulative adjustment factors for base prices.

    """
    # Divide size by the period before.
    adjustment_factors = quality_value.div(quality_value.shift(1, axis=1))

    if to_adjust is not None:
        adjustment_factors[~to_adjust] = 1

    if to_reset is not None:
        # Get the inverse cumulative growth for resetting.
        impute_resets = get_cumulative_adjustments(adjustment_factors).pow(-1)
        adjustment_factors = adjustment_factors.mask(to_reset, impute_resets)

    # Fill data lost in first period with 1 i.e. no adjustment.
    return get_cumulative_adjustments(adjustment_factors).fillna(1)
Example #35
    def auto_cor_cov(self,
                     data: pd.DataFrame,
                     order: int = 2,
                     decay: int = 2) -> pd.DataFrame:
        """
        矩阵与矩阵相关性计算:
        A = np.array([[a11,a21],[a12,a22]])
        B = np.array([[b11,b21],[b12,b22]])

        matrix = [[cov([a11,a21], [a11,a21]), cov([a11,a21], [a12,a22]), cov([a11,a21], [b11,b21]), cov([a11,a21], [b12,b22])],
                  [cov([a12,a22], [a11,a21]), cov([a12,a22], [a12,a22]), cov([a12,a22], [b11,b21]), cov([a12,a22], [b12,b22])],
                  [cov([b11,b21], [a11,a21]), cov([b11,b21], [a12,a22]), cov([b11,b21], [b11,b21]), cov([b11,b21], [b12,b22])],
                  [cov([b12,b22], [a11,a21]), cov([b12,b22], [a12,a22]), cov([b12,b22], [b11,b21]), cov([b12,b22], [b12,b22])]]

        自相关协方差矩阵为:
        matrix_at_cor_cov = [[cov([a11,a21], [b11,b21]), cov([a11,a21], [b12,b22])],
                             [cov([a12,a22], [b11,b21]), cov([a12,a22], [b12,b22])]

        注:
        输入pd.DataFrame格式的数据计算协方差会以行为单位向量进行计算
        计算出来的协方差矩阵中右上角order*order矩阵才是自相关矩阵
        协方差矩阵:横向为当期与各因子滞后阶数的协方差;纵向为滞后阶数与当期各因子的协方差
        :param data:
        :param order:
        :param decay:
        :return:
        """

        # order matrix
        matrix_order = data.shift(order).dropna(axis=0, how='all')
        matrix = data.iloc[order:, :].copy(deep=True)

        w_list = self.Half_time(period=matrix.shape[0], decay=decay)
        w_list = sorted(w_list, reverse=False)  # sort in ascending order

        covs = np.cov(matrix.T, matrix_order.T, aweights=w_list)  # needs further testing
        cov_order = pd.DataFrame(covs[:-matrix.shape[1], -matrix.shape[1]:],
                                 index=matrix.columns,
                                 columns=matrix.columns)

        return cov_order
Example #36
def _add_dribbles(actions: pd.DataFrame) -> pd.DataFrame:
    next_actions = actions.shift(-1)

    same_team = actions.team_id == next_actions.team_id
    # not_clearance = actions.type_id != actiontypes.index("clearance")

    dx = actions.end_x - next_actions.start_x
    dy = actions.end_y - next_actions.start_y
    far_enough = dx ** 2 + dy ** 2 >= min_dribble_length ** 2
    not_too_far = dx ** 2 + dy ** 2 <= max_dribble_length ** 2

    dt = next_actions.time_seconds - actions.time_seconds
    same_phase = dt < max_dribble_duration
    same_period = actions.period_id == next_actions.period_id

    dribble_idx = same_team & far_enough & not_too_far & same_phase & same_period

    dribbles = pd.DataFrame()
    prev = actions[dribble_idx]
    nex = next_actions[dribble_idx]
    dribbles["game_id"] = nex.game_id
    dribbles["period_id"] = nex.period_id
    dribbles["action_id"] = prev.action_id + 0.1
    dribbles["time_seconds"] = (prev.time_seconds + nex.time_seconds) / 2
    dribbles["timestamp"] = nex.timestamp
    dribbles["team_id"] = nex.team_id
    dribbles["player_id"] = nex.player_id
    dribbles["start_x"] = prev.end_x
    dribbles["start_y"] = prev.end_y
    dribbles["end_x"] = nex.start_x
    dribbles["end_y"] = nex.start_y
    dribbles["bodypart_id"] = spadlconfig.bodyparts.index("foot")
    dribbles["type_id"] = spadlconfig.actiontypes.index("dribble")
    dribbles["result_id"] = spadlconfig.results.index("success")

    actions = pd.concat([actions, dribbles], ignore_index=True, sort=False)
    actions = actions.sort_values(["game_id", "period_id", "action_id"]).reset_index(
        drop=True
    )
    actions["action_id"] = range(len(actions))
    return actions
Example #37
        def get_data(self):
            """

            :return:
            """
            date = self.date
            pre_date = w.tdaysoffset(self.window, date, "Period=M")
            pre_date = pre_date.Data[0][0].strftime("%Y-%m-%d")
            volitality = w.wsd(self.stockcodes, "close", pre_date, date,
                               "Fill=Previous")
            if volitality.ErrorCode != 0:
                print("数据提取异常")
                raise Exception("数据提取异常")
            vol = DataFrame(np.array(volitality.Data).T,
                            columns=volitality.Codes,
                            index=volitality.Times)
            ret = vol / vol.shift(1) - 1
            volitality = FactorsZoo.check_data(
                (math.sqrt(252) * ret.std()).values.tolist())

            return volitality
Example #38
def sentiment_stockPrice_series(sentimentScore, stockPrice, scaler):
    # get the sentiment score and the close price of the stock from input file
    sentiment = read_csv(sentimentScore, header=0, index_col=0).values
    stockPrice = read_csv(stockPrice, header=0, index_col=0).values
    print(stockPrice)
    # transfer all data into float32
    sentiment = sentiment.astype('float32')
    stockPrice = stockPrice.astype('float32')
    #normalize features
    sentimentS = scaler.fit_transform(sentiment)
    stockPriceS = scaler.fit_transform(stockPrice)

    # build a supervised learning dataset

    score = DataFrame(sentimentS, index=[i for i in range(0, len(sentimentS))])
    price = DataFrame(stockPriceS,
                      index=[i for i in range(0, len(stockPriceS))])
    Tprice = price.shift(-1)

    for i in range(0, Tprice.size, 1):
        if (Tprice.at[i, 0] - price.at[i, 0] > 0):
            Tprice.at[i, 0] = 1
        else:
            Tprice.at[i, 0] = 0

    cols, names = list(), list()

    cols.append(score)
    cols.append(price)
    cols.append(Tprice)

    names.append('score')
    names.append('price')
    names.append('trend')

    #put names and the data in cols together
    result = concat(cols, axis=1)
    result.columns = names

    return result
Example #39
def to_supervised(data,
                  n_in=1,
                  n_out=1,
                  diff_in=False,
                  diff_out=True,
                  drop_nan=True):
    '''
    @CopyRight: Code is inspired by weblog of machinelearningmastery.com
    Copies the data columns (of an nD sequence) so that for each timestep you have an "in" seq and an "out" seq
    :param data:
    :param n_in: length of "in" seq (number of observations)
    :param n_out: length of "out" seq (number of predictions)
    :param diff_in: if True the "in" columns are differential, otherwise they are absolute
    :param diff_out: if True the "out" columns are differential, otherwise they are absolute
    :param drop_nan: if True, eliminate the samples that contain NaN (due to the shift operation)
    :return: a table whose columns are n_in * nD (observations) and then n_out * nD (predictions)
    '''

    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        names += [('var_in%d(t-%d)' % (j + 1, i - 1)) for j in range(n_vars)]
        if diff_in:
            cols.append(df.shift(i - 1) - df.shift(i))
        else:
            cols.append(df.shift(i - 1))

    # forecast sequence (t, t+1, ... t+n)
    for i in range(1, n_out + 1):
        names += [('var_out%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
        if diff_out:
            cols.append(df.shift(-i) - df.shift(0))  # displacement
        else:
            cols.append(df.shift(-i))  # position

    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if drop_nan:
        agg.dropna(inplace=True)

    return agg.values
Example #40
def ha(open_, high, low, close, offset=None, **kwargs):
    """Candle Type: Heikin Ashi"""
    # Validate Arguments
    open_ = verify_series(open_)
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    offset = get_offset(offset)

    # Calculate Result
    m = close.size
    df = DataFrame({
        "HA_open": 0.5 * (open_.iloc[0] + close.iloc[0]),
        "HA_high": high,
        "HA_low": low,
        "HA_close": 0.25 * (open_ + high + low + close),
    })

    for i in range(1, m):
        df["HA_open"][i] = 0.5 * (df["HA_open"][i - 1] + df["HA_close"][i - 1])

    df["HA_high"] = df[["HA_open", "HA_high", "HA_close"]].max(axis=1)
    df["HA_low"] = df[["HA_open", "HA_low", "HA_close"]].min(axis=1)

    # Offset
    if offset != 0:
        df = df.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        df.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        df.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    df.name = "Heikin-Ashi"
    df.category = "candles"

    return df
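The verify_series/get_offset helpers above come from the surrounding library's utilities; the self-contained sketch below reproduces the same Heikin Ashi recursion with plain pandas only.

import pandas as pd

ohlc = pd.DataFrame({
    "open":  [10.0, 10.5, 10.2],
    "high":  [10.8, 10.9, 10.6],
    "low":   [9.9, 10.1, 10.0],
    "close": [10.5, 10.3, 10.4],
})
ha_close = 0.25 * (ohlc["open"] + ohlc["high"] + ohlc["low"] + ohlc["close"])
ha_open = pd.Series(index=ohlc.index, dtype=float)
ha_open.iloc[0] = 0.5 * (ohlc["open"].iloc[0] + ohlc["close"].iloc[0])
for i in range(1, len(ohlc)):
    # each HA open is the midpoint of the previous HA open and HA close
    ha_open.iloc[i] = 0.5 * (ha_open.iloc[i - 1] + ha_close.iloc[i - 1])
ha_high = pd.concat([ha_open, ohlc["high"], ha_close], axis=1).max(axis=1)
ha_low = pd.concat([ha_open, ohlc["low"], ha_close], axis=1).min(axis=1)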
Example #41
    def test_shift(self):
        # naive shift
        shiftedFrame = self.tsframe.shift(5)
        self.assert_index_equal(shiftedFrame.index, self.tsframe.index)

        shiftedSeries = self.tsframe['A'].shift(5)
        assert_series_equal(shiftedFrame['A'], shiftedSeries)

        shiftedFrame = self.tsframe.shift(-5)
        self.assert_index_equal(shiftedFrame.index, self.tsframe.index)

        shiftedSeries = self.tsframe['A'].shift(-5)
        assert_series_equal(shiftedFrame['A'], shiftedSeries)

        # shift by 0
        unshifted = self.tsframe.shift(0)
        assert_frame_equal(unshifted, self.tsframe)

        # shift by DateOffset
        shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
        self.assertEqual(len(shiftedFrame), len(self.tsframe))

        shiftedFrame2 = self.tsframe.shift(5, freq='B')
        assert_frame_equal(shiftedFrame, shiftedFrame2)

        d = self.tsframe.index[0]
        shifted_d = d + offsets.BDay(5)
        assert_series_equal(self.tsframe.xs(d),
                            shiftedFrame.xs(shifted_d), check_names=False)

        # shift int frame
        int_shifted = self.intframe.shift(1)  # noqa

        # Shifting with PeriodIndex
        ps = tm.makePeriodFrame()
        shifted = ps.shift(1)
        unshifted = shifted.shift(-1)
        self.assert_index_equal(shifted.index, ps.index)
        self.assert_index_equal(unshifted.index, ps.index)
        tm.assert_numpy_array_equal(unshifted.iloc[:, 0].valid().values,
                                    ps.iloc[:-1, 0].values)

        shifted2 = ps.shift(1, 'B')
        shifted3 = ps.shift(1, offsets.BDay())
        assert_frame_equal(shifted2, shifted3)
        assert_frame_equal(ps, shifted2.shift(-1, 'B'))

        assertRaisesRegexp(ValueError, 'does not match PeriodIndex freq',
                           ps.shift, freq='D')

        # shift other axis
        # GH 6371
        df = DataFrame(np.random.rand(10, 5))
        expected = pd.concat([DataFrame(np.nan, index=df.index,
                                        columns=[0]),
                              df.iloc[:, 0:-1]],
                             ignore_index=True, axis=1)
        result = df.shift(1, axis=1)
        assert_frame_equal(result, expected)

        # shift named axis
        df = DataFrame(np.random.rand(10, 5))
        expected = pd.concat([DataFrame(np.nan, index=df.index,
                                        columns=[0]),
                              df.iloc[:, 0:-1]],
                             ignore_index=True, axis=1)
        result = df.shift(1, axis='columns')
        assert_frame_equal(result, expected)
Example #42
	volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

	price.to_csv(filename)

else:
	price = pd.read_csv(filename)
	price.index = [datetime.strptime(x,'%Y-%m-%d') for x in price['Date']]
	price = price.drop('Date', axis=1)


# Specify number of days to shift
shift = 20
# Specify filter "length"
filter_len = shift

shift_returns = price/price.shift(shift) - 1
shift_returns_mean = shift_returns.ewm(span=filter_len).mean()
shift_returns_var = shift_returns.ewm(span=filter_len).var()

CovSeq = pd.DataFrame()
for FirstStock in np.arange(NumStocks-1):
	for SecondStock in np.arange(FirstStock+1,NumStocks):
		ColumnTitle = StockList[FirstStock] + '-' + StockList[SecondStock]
		CovSeq[ColumnTitle] = shift_returns[StockList[FirstStock]].ewm(span=filter_len).cov(shift_returns[StockList[SecondStock]])

# Test CVXOPT code for a single day
date = '2013-10-31'
n = NumStocks+1
pbar = matrix(interest_rate,(1,n))
p2 = shift_returns_mean.loc[date]
p2 = matrix(p2)
Example #43
    quotes = fin.quotes_historical_yahoo(symbol, start, end)
    dates, open, close, high, low, volume = zip(*quotes)

    data = {"open": open, "close": close, "high": high, "low": low, "volume": volume}

    dates = Index([datetime.fromordinal(int(d)) for d in dates])
    return DataFrame(data, index=dates)


msft = getQuotes("MSFT", startDate, endDate)
aapl = getQuotes("AAPL", startDate, endDate)
goog = getQuotes("GOOG", startDate, endDate)
ibm = getQuotes("IBM", startDate, endDate)

px = DataFrame({"MSFT": msft["close"], "IBM": ibm["close"], "GOOG": goog["close"], "AAPL": aapl["close"]})
returns = px / px.shift(1) - 1
# Select dates

subIndex = ibm.index[(ibm["close"] > 95) & (ibm["close"] < 100)]
msftOnSameDates = msft.reindex(subIndex)

# Insert columns

msft["hi-lo spread"] = msft["high"] - msft["low"]
ibm["hi-lo spread"] = ibm["high"] - ibm["low"]

# Aggregate monthly


def toMonthly(frame, how):
    offset = BMonthEnd()
Example #44
    def test_shift_empty(self):
        # Regression test for #8019
        df = DataFrame({'foo': []})
        rs = df.shift(-1)

        assert_frame_equal(df, rs)
Example #45
from pandas import DataFrame
from pandas import concat
from pandas import read_csv
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error

# read data from csv_file
series = read_csv('../../static/data_set.csv',
                  nrows=2000,
                  header=0,
                  parse_dates=[0],
                  index_col=0,
                  squeeze=True)

# Create lagged dataset
values = DataFrame(series.values)
dataframe = concat([values.shift(1), values], axis=1)
dataframe.columns = ['t-1', 't+1']
print(dataframe.head(5))

# split into train and test sets
X = dataframe.values
train_size = int(len(X) * 0.66)
train, test = X[1:train_size], X[train_size:]
train_X, train_y = train[:, 0], train[:, 1]
test_X, test_y = test[:, 0], test[:, 1]


# persistence model
def model_persistence(x):
    return x
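A plausible continuation (a hedged sketch using only names defined above): run the test inputs through the persistence model and score the forecast with the mean_squared_error already imported.

predictions = [model_persistence(x) for x in test_X]
test_score = mean_squared_error(test_y, predictions)
print('Test MSE: %.3f' % test_score)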
Example #46
def estimateBeta(priceY,priceX,algo = 'standard'):
    '''
    estimate stock Y vs stock X beta using iterative linear
    regression. Outliers outside 3 sigma boundary are filtered out

    Parameters
    --------
    priceX : price series of x (usually market)
    priceY : price series of y (estimate beta of this price)

    Returns
    --------
    beta : stockY beta relative to stock X
    '''

    X = DataFrame({'x':priceX,'y':priceY})

    if algo=='returns':
        ret = (X/X.shift(1)-1).dropna().values

        #print len(ret)
        
        x = ret[:,0]
        y = ret[:,1]
        
        # filter high values
        low = np.percentile(x,20)
        high = np.percentile(x,80)
        iValid = (x>low) & (x<high)
        
        x = x[iValid]
        y = y[iValid]
        

        iteration = 1
        nrOutliers = 1
        while iteration < 10 and nrOutliers > 0 :
            (a,b) = polyfit(x,y,1)
            yf = polyval([a,b],x)
            #plot(x,y,'x',x,yf,'r-')
            err = yf-y
            idxOutlier = abs(err) > 3*np.std(err)
            nrOutliers =sum(idxOutlier)
            beta = a
            #print 'Iteration: %i beta: %.2f outliers: %i' % (iteration,beta, nrOutliers)
            x = x[~idxOutlier]
            y = y[~idxOutlier]
            iteration += 1

    elif algo=='log':
        x = np.log(X['x'])
        y = np.log(X['y'])
        (a,b) = polyfit(x,y,1)
        beta = a

    elif algo=='standard':
        ret = np.log(X).diff().dropna()
        beta = ret['x'].cov(ret['y']) / ret['x'].var()

    else:
        raise TypeError("unknown algorithm type, use 'standard', 'log' or 'returns'")

    return beta
Example #47
def clean_pw_offday(pw_offday, weeklookup, pw_slp2):
    '''
    Clean pw_offday query without filtering out non-off-days
    invoice-level => day level => customer level
    '''
    print('*'*100)
    print('Cleaning pw_offday query and creating summaries.')    
    print('*'*100)
    deliveries = pw_offday
    
    print('\n\n\nDeclaring functions for later use.')
    def as400_date(dat):
        '''Accepts date as formatted in AS400'''
        dat = str(dat)
        dat = dat[-6:]
        dat = dt.date(dt.strptime(dat, '%y%m%d'))
        return dat
        
    def sum_digits_in_string(digit):
        return sum(int(x) for x in digit if x.isdigit())
        
    print('Mapping Columns.')
    deliveries.rename(columns={'#MIVDT':'Date', '#MDIV#':'Division', '#MIVND':'Invoice', 
                       '#MCUS#':'CustomerId', '#MCALL':'Call', '#MPRIO':'Priority', 
                       '#MCMP':'Warehouse', 'CASES':'Cases', '#MEXT$':'Dollars', 
                       'CSHP':'Ship', '#MSLSP':'SalespersonId', 
                       'CADMBR':'ShipWeekPlan', 'CUDSCC':'Merchandising', 'CONPRM':'OnPremise', 
                       'CSTDTE':'CustomerSetup', '#MCUSY':'CustomerType', 'CCUSTN':'Customer'}, inplace=True)
    pw_slp2.rename(columns={'S2NUM#':'SalespersonId', 'S2NAME':'Salesperson', 'S2DIVR':'SalespersonDirector'}, 
                   inplace=True)
                   
    deliveries = deliveries.merge(pw_slp2, on='SalespersonId', how='left')
    
    print('Mapping Customer types.')
    typ_map = {'A':'Bar/Tavern','C':'Country Club','E':'Transportation/Airline','G':'Gambling',\
                'J':'Hotel/Motel','L':'Restaurant','M':'Military','N':'Fine Dining','O':'Internal',\
                'P':'Country/Western','S':'Package Store','T':'Supermarket/Grocery','V':'Drug Store',\
                'Y':'Convenience Store','Z':'Catering','3':'Night Club','5':'Adult Entertainment','6':'Sports Bar',\
                'I':'Church','F':'Membership Club','B':'Mass Merchandiser','H':'Fraternal Organization',\
                '7':'Sports Venue'}
    deliveries.CustomerType = deliveries.CustomerType.astype(str).map(typ_map)    
    
    print('Mapping Warehouse names.')
    whs_map = {1:'Kansas City',2:'Saint Louis',3:'Columbia',4:'Cape Girardeau', 5:'Springfield'}
    deliveries.Warehouse = deliveries.Warehouse.map(whs_map)          
    
    print('Processing dates.')
    deliveries.Date = [as400_date(d) for d in deliveries.Date.astype(str).tolist()]    
    weeklookup['Date'] = [dt.date(dt.strptime(w_Dat, '%m/%d/%Y')) for w_Dat in weeklookup['Date'].astype(str).tolist()]
    
    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')
    
    dat = Series(deliveries.Date.tolist())
    deliveries['Weekday'] = Series([dt.strftime(d, '%A') for d in dat])
    
    week_plan = deliveries.ShipWeekPlan.tolist()
    week_shipped = deliveries.ShipWeek.tolist()
    
    print('Using custom logic to derive which days were off-day deliveries.')
    deliveries.Ship = del_days = [str('%07d'% int(str(day).zfill(0))) for day in deliveries.Ship.astype(str).tolist()]
    
    mon = Series([d[-7:][:1] for d in del_days]).map({'1':'M','0':'_'})
    tue = Series([d[-6:][:1] for d in del_days]).map({'1':'T','0':'_'})
    wed = Series([d[-5:][:1] for d in del_days]).map({'1':'W','0':'_'})
    thu = Series([d[-4:][:1] for d in del_days]).map({'1':'R','0':'_'})
    fri = Series([d[-3:][:1] for d in del_days]).map({'1':'F','0':'_'})
    sat = Series([d[-2:][:1] for d in del_days]).map({'1':'S','0':'_'})
    sun = Series([d[-1:][:1] for d in del_days]).map({'1':'U','0':'_'})
    
    deliveries['DeliveryDays'] = del_days = list(itertools.chain.from_iterable([mon + tue + wed + thu + fri + sat + sun]))
    
    weekday = deliveries.Weekday = [d[:3] for d in deliveries.Weekday.astype(str).tolist()]
    _days = DataFrame(data={'Weekday':weekday, 'WeekPlanned':week_plan, 'WeekShipped':week_shipped, 'DelDays':del_days}) #'Monday':mon, 'Tuesday':tue, 'Wednesday':wed, 'Thursday':thu, 'Friday':fri, 'Saturday':sat, 'Sunday':sun,
    day_list = _days['WeekPlanned'].tolist()
    _days['WeekPlanned'] = [d if d in ['A','B'] else '' for d in day_list]
    
    _week_actual = _days.WeekShipped.tolist()
    _week_plan = _days['WeekPlanned'] = [ship_week if plan_week == '' else plan_week for ship_week, plan_week in zip(_week_actual,_days.WeekPlanned.tolist())]
    _days['OffWeek'] = _off_week = [p != a for p, a in zip(_week_plan, _week_actual)]
    
    off_mon = [str('M' not in d and w == 'Mon')[:1] for d, w in zip(del_days, weekday)]
    off_tue = [str('T' not in d and w == 'Tue')[:1] for d, w in zip(del_days, weekday)]
    off_wed = [str('W' not in d and w == 'Wed')[:1] for d, w in zip(del_days, weekday)]
    off_thu = [str('R' not in d and w == 'Thu')[:1] for d, w in zip(del_days, weekday)]
    off_fri = [str('F' not in d and w == 'Fri')[:1] for d, w in zip(del_days, weekday)]
    off_sat = [str('S' not in d and w == 'Sat')[:1] for d, w in zip(del_days, weekday)]
    off_sun = [str('U' not in d and w == 'Sun')[:1] for d, w in zip(del_days, weekday)]
    
    _off_days = DataFrame({'Mon':off_mon, 'Tue':off_tue, 'Wed':off_wed, 'Thu':off_thu, 
                           'Fri':off_fri, 'Sat':off_sat, 'Sun':off_sun, 'OffWeek':_off_week, 'Weekday':weekday})
    _off_days = _off_days[['Mon','Tue','Wed','Thu','Fri','Sat','Sun','Weekday','OffWeek']]                           
    _off_days['OffDayDelivery'] = (_off_days['Mon'] == 'T') | (_off_days['Tue'] == 'T') | (_off_days['Wed'] == 'T') | (_off_days['Thu'] == 'T') | (_off_days['Fri'] == 'T') | (_off_days['Sat'] == 'T') | (_off_days['Sun'] == 'T') | (_off_days['OffWeek'] == True)                
       
    print('Check here if you suspect a bug.')                    
    #check_later = _off_days[_off_days['OffDayDelivery'] == True]
    
    print('Mapping Call Codes.')
    deliveries = pd.concat([deliveries,_off_days[['OffWeek','OffDayDelivery']]], axis=1)
    deliveries.Call = deliveries.Call.map({1:'Customer Call', 2:'ROE/EDI', 3:'Salesperson Call', 4:'Telesales'})
    
    print('Putting Setup Date into proper date format.')
    setup_date = deliveries.CustomerSetup.astype(str).tolist()
    setup_month = Series([d.zfill(4)[:2] for d in setup_date])
    setup_year = Series(["20" + s[-2:] if int(s[-2:]) < 20 else "19" + s[-2:] for s in setup_date]) #this_century = [int(d[-2:]) < 20 for d in setup_date]
    
    deliveries['CustomerSetup'] = c_setup = [str(mon) + '-' + str(yr) for mon, yr in zip(setup_month, setup_year)]
    
    print('Defining new customers based on whether they were setup last month or not.')
    if dt.now().month == 1:
        last_month = '12'
    else:
        last_month = str(dt.now().month - 1).zfill(2)
    if dt.now().month == 1:
        this_year = str(dt.now().year - 1)
    else:
        this_year = str(dt.now().year)
    m_y_cutoff = last_month + '-' + this_year
        
    deliveries['NewCustomer'] = [1 if m_y_cutoff == setup else 0 for setup in c_setup]
    deliveries['OffDayDeliveries'] =  deliveries.OffDayDelivery.astype(int)
    
    print('Deriving number of weekly deliveries allotted to each customer.')
    _n_days = deliveries.Ship.astype(str).tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [sum_digits_in_string(n) for n in _n_days]
    _allot = deliveries['AllottedWeeklyDeliveryDays'].tolist()
    _week_ind = deliveries['ShipWeekPlan'].tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [a if w not in ['A','B'] else 0.5 for a, w in zip(_allot, _week_ind)]
    _n_days = deliveries.set_index('CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()
    
    print('\n')
    print('-'*100)    
    print('\n')    
    
    print('Aggregating by Day.')
    agg_funcs_day = {'OffDayDeliveries' : {'Count':max}, 
                 'Date' : {'Count':len_unique},
                 'Cases' : {'Sum':sum, 'Avg':np.mean},
                 'Dollars' : {'Sum':sum, 'Avg':np.mean},
                 'NewCustomer': lambda x: min(x)}
    
    pass_through_cols = ['CustomerId','Customer','Week','Date']
    _agg_byday = DataFrame(deliveries.groupby(pass_through_cols).agg(agg_funcs_day)).reset_index(drop=False)
    _agg_byday = DataFrame(_agg_byday[['CustomerId','Customer','Week','Date','OffDayDeliveries','NewCustomer','Cases','Dollars']])
    _agg_byday.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byday.columns]
    _agg_byday.columns = ['CustomerId','Customer','Week','Date','Delivery','OffDayDelivery','NewCustomer','Cases|Sum','Cases|Avg','Dollars|Sum','Dollars|Avg']
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['CustomerId'].astype(int)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['AllottedWeeklyDeliveryDays|Count'].map(_n_days)
    
    
    
    print('Aggregating by Week.')
    agg_funcs_week = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x)}
    
    _agg_byweek = DataFrame(_agg_byday.groupby(['CustomerId','Week']).agg(agg_funcs_week)).reset_index(drop=False)
    _agg_byweek.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byweek.columns]
    
    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries 
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery. Use a dictionary {(cust#, week) : n_deliveries_total}
    _c = _agg_byweek['CustomerId'].astype(str).tolist()
    _w = _agg_byweek['Week'].astype(str).tolist()
    _agg_byweek['_X'] = [c + ',' + w for c,w in zip(_c,_w)]
    by_week_map = _agg_byweek.set_index('_X')['Delivery|Count'].to_dict()
    
    cid = _agg_byday['CustomerId'].astype(str).tolist()
    wkk = _agg_byday['Week'].astype(str).tolist()
    _agg_byday['N_DeliveriesThisWeek'] = [c + ',' + w for c, w in zip(cid, wkk)]
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday['N_DeliveriesThisWeek'].map(Series(by_week_map))
    
    
    print('Using custom logic to define Additional Delivery Days.')
    addl_day_criteria_1 = ( _agg_byday.shift(1)['CustomerId'] == _agg_byday['CustomerId'] )
    addl_day_criteria_2 = ( _agg_byday.shift(1)['Week'] == _agg_byday['Week'] )
    addl_day_criteria_3 = ( _agg_byday['OffDayDelivery'] == 1 )
    addl_day_criteria_4 = ( _agg_byday['NewCustomer'] != 1 )
    addl_day_criteria_5 = ( _agg_byday['N_DeliveriesThisWeek'] > _agg_byday['AllottedWeeklyDeliveryDays|Count'] )
    
    _agg_byday['AdditionalDeliveryDays'] = Series(addl_day_criteria_1 & addl_day_criteria_2 & addl_day_criteria_3 & addl_day_criteria_4 & addl_day_criteria_5).astype(int)
    
    
    print('Aggregating by Customer.')    
    agg_funcs_cust = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x),
                      'AllottedWeeklyDeliveryDays|Count': lambda x: max(x),
                      'AdditionalDeliveryDays': lambda x: sum(x),
                      'Dollars|Sum':lambda x: int(sum(x)),
                      'Cases|Sum':lambda x: sum(x) }                                           
    
    _agg_bycust = DataFrame(_agg_byday.groupby(['CustomerId','Customer']).agg(agg_funcs_cust)).reset_index(drop=False)
    _agg_bycust.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_bycust.columns]
    _agg_bycust = _agg_bycust.reindex_axis(sorted(_agg_bycust.columns), axis=1)
    
    _agg_bycust.columns = ['AdditionalDeliveries','AllottedDeliveryDays','Cases',
                           'Customer','CustomerId','Deliveries','Dollars',
                           'NewCustomer','OffDayDeliveries']
    _agg_bycust = _agg_bycust[['CustomerId','Customer','NewCustomer','AllottedDeliveryDays','Deliveries',
                               'OffDayDeliveries','AdditionalDeliveries','Cases','Dollars']]
    
    
    print('Mapping useful Customer attributes.')
    attr = ['CustomerId','Warehouse','OnPremise','CustomerSetup','CustomerType','ShipWeekPlan','DeliveryDays']
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(drop=True)
    
    _agg_bycust = _agg_bycust.merge(customer_attributes, on='CustomerId', how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(by=['AdditionalDeliveries','OffDayDeliveries'], ascending=False).reset_index(drop=True)
    
    _agg_bycust['CasesPerDelivery'] = _agg_bycust['Cases'] / _agg_bycust['Deliveries']
    _agg_bycust['DollarsPerDelivery'] = round(_agg_bycust['Dollars'] / _agg_bycust['Deliveries'],2)
    
    _agg_bycust['OffDayDeliveries/Deliveries'] = round(_agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'],2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(_agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'],2)
    
    
    print('Mapping Tiers based on allotted delivery days.')
    tier_map = {0:'No Delivery Days Assigned',0.5:'Tier 4', 1:'Tier 3', 2:'Tier 2', 3:'Tier 1', 4:'Tier 1', 5:'Tier 1', 6:'Tier 1', 7:'Tier 1'}
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)
    
    addl_deliv = _agg_bycust['AdditionalDeliveries'].tolist()
    tier = _agg_bycust['Tier'].tolist()
    
    _agg_bycust['AdditionalDeliveries'] = [addl if t != 'No Delivery Days Assigned' else 0 for addl, t in zip(addl_deliv, tier)]
    
    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(np.nan, '')
    
    
    print('Creating Overall Summary.')
    agg_funcs_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           
    
    overall_summary = DataFrame(_agg_bycust.groupby(['Tier','Warehouse']).agg(agg_funcs_summary))
    overall_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in overall_summary.columns]
    overall_summary = overall_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    overall_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']
    
    print('Creating High-Level Summary.\n\n\n')
    agg_funcs_HL_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           
    
    high_level_summary = DataFrame(_agg_bycust.groupby(['Tier']).agg(agg_funcs_HL_summary))
    high_level_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in high_level_summary.columns]
    high_level_summary = high_level_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    high_level_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']
                                       
    print('*'*100)
    print('Finished creating summaries at high level, overall, and aggregating by customer and by day.')
    print('*'*100)    

    return high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries