Example #1
0
class Shift(object):
    # Benchmark DataFrame.shift along each axis (frame shift speedup issue-5609).
    params = [0, 1]
    param_names = ['axis']

    def setup(self, axis):
        # A wide 10,000 x 500 frame of uniform randoms, rebuilt per parameter.
        self.df = DataFrame(np.random.rand(10000, 500))

    def time_shift(self, axis):
        # Time a single one-period shift along the parametrized axis.
        self.df.shift(1, axis=axis)
Example #2
0
    def test_shift_fill_value(self):
        # GH #24128
        df = DataFrame([1, 2, 3, 4, 5],
                       index=date_range('1/1/2000', periods=5, freq='H'))
        exp = DataFrame([0, 1, 2, 3, 4],
                        index=date_range('1/1/2000', periods=5, freq='H'))
        result = df.shift(1, fill_value=0)
        assert_frame_equal(result, exp)

        exp = DataFrame([0, 0, 1, 2, 3],
                        index=date_range('1/1/2000', periods=5, freq='H'))
        result = df.shift(2, fill_value=0)
        assert_frame_equal(result, exp)
Example #3
0
def main():
    """Download a decade of daily quotes and build price/volume/return frames.

    Fixes over the original: ``all_data`` was never initialized (NameError on
    first assignment), the start date read ``'1/3/200'`` (year 200 -- typo for
    2000), and ``iteritems()`` is Python-2-only (``items()`` on Python 3).

    Returns the frame of daily simple returns (callers that ignored the
    previous ``None`` return are unaffected).
    """
    all_data = {}
    for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
        # Network I/O: fetch adjusted daily history for 2000-2009.
        all_data[ticker] = web.get_data_yahoo(ticker, '1/3/2000', '12/31/2009')

    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})
    returns = (price - price.shift(1)) / price
    return returns
Example #4
0
def timeseries_to_supervised(data, lag=1):
    """Frame a series as a supervised-learning table of lagged columns.

    Columns for t-lag .. t-1 are placed before the unshifted series; the
    leading NaNs produced by shifting are replaced with 0.
    """
    frame = DataFrame(data)
    lagged = [frame.shift(offset) for offset in range(1, lag + 1)]
    lagged.append(frame)
    supervised = concat(lagged, axis=1)
    supervised.fillna(0, inplace=True)
    return supervised
Example #5
0
 def test_shift_categorical(self):
     # GH 9416
     s1 = pd.Series(['a', 'b', 'c'], dtype='category')
     s2 = pd.Series(['A', 'B', 'C'], dtype='category')
     df = DataFrame({'one': s1, 'two': s2})
     rs = df.shift(1)
     xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
     assert_frame_equal(rs, xp)
Example #6
0
 def test_shift_bool(self):
     df = DataFrame({'high': [True, False],
                     'low': [False, False]})
     rs = df.shift(1)
     xp = DataFrame(np.array([[np.nan, np.nan],
                              [True, False]], dtype=object),
                    columns=['high', 'low'])
     assert_frame_equal(rs, xp)
Example #7
0
 def test_shift_categorical(self):
     # GH 9416
     s1 = pd.Series(["a", "b", "c"], dtype="category")
     s2 = pd.Series(["A", "B", "C"], dtype="category")
     df = DataFrame({"one": s1, "two": s2})
     rs = df.shift(1)
     xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)})
     assert_frame_equal(rs, xp)
def estimateBeta(priceY, priceX, algo="standard"):
    """Estimate the beta of stock Y relative to stock X.

    For the 'returns' algorithm an iterative linear regression is used and
    points whose residual exceeds 3 sigma are discarded between passes.

    Parameters
    ----------
    priceY : price series of y (estimate beta of this price)
    priceX : price series of x (usually the market)
    algo : one of 'standard', 'log', 'returns'

    Returns
    -------
    beta : slope of Y versus X under the chosen algorithm

    Raises
    ------
    TypeError : if ``algo`` is not one of the three supported names.
    """
    X = DataFrame({"x": priceX, "y": priceY})

    if algo == "returns":
        # Simple one-period returns; the first (NaN) row is dropped.
        ret = (X / X.shift(1) - 1).dropna().values
        x, y = ret[:, 0], ret[:, 1]

        # Refit after discarding >3-sigma residuals; at most 9 passes or
        # until a pass finds no outliers.
        iteration, nrOutliers = 1, 1
        while iteration < 10 and nrOutliers > 0:
            a, b = polyfit(x, y, 1)
            residual = polyval([a, b], x) - y
            outlier_mask = abs(residual) > 3 * np.std(residual)
            nrOutliers = sum(outlier_mask)
            beta = a
            x, y = x[~outlier_mask], y[~outlier_mask]
            iteration += 1

    elif algo == "log":
        # Slope of the log-log regression is the beta estimate.
        a, _ = polyfit(np.log(X["x"]), np.log(X["y"]), 1)
        beta = a

    elif algo == "standard":
        # Covariance of log returns over the variance of the market's.
        ret = np.log(X).diff().dropna()
        beta = ret["x"].cov(ret["y"]) / ret["x"].var()

    else:
        raise TypeError("unknown algorithm type, use 'standard', 'log' or 'returns'")

    return beta
Example #9
0
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Convert a (multivariate) time series into a supervised-learning frame.

    Args:
        data: list or 2-D array-like of observations.
        n_in: number of lag observations (t-n_in .. t-1) used as input.
        n_out: number of forecast observations (t .. t+n_out-1) as output.
        dropnan: drop rows containing NaNs created by the shifting.

    Returns:
        DataFrame with columns named var<j>(t-i), var<j>(t), var<j>(t+i).
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    # input sequence (t-n_in, ..., t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (var + 1, lag) for var in range(n_vars))
    # forecast sequence (t, t+1, ..., t+n_out-1)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step:
            names.extend('var%d(t+%d)' % (var + 1, step) for var in range(n_vars))
        else:
            names.extend('var%d(t)' % (var + 1) for var in range(n_vars))
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
Example #10
0
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series for supervised learning, keeping only the last
    variable's current value as the single target column.

    NOTE(review): the target column is named ``'var%d' % (n_out - 1)``, which
    looks like it was meant to index the variable, not n_out -- behavior
    preserved as-is; confirm with the original author.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = [], []
    # input sequence (t-n_in, ..., t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (var + 1, lag) for var in range(n_vars)]
    # target: current-time values with all but the last variable dropped
    targets = df.shift(0)
    targets.drop(targets.columns[list(range(n_vars - 1))], axis=1, inplace=True)
    cols.append(targets)
    names += ['var%d' % (n_out - 1)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values introduced by the lagging
    if dropnan:
        agg.dropna(inplace=True)
    return agg
Example #11
0
def estimateBeta(priceY, priceX, algo='standard'):
    """Estimate the beta of stock Y relative to stock X.

    For the 'returns' algorithm the x-returns are first trimmed to their
    20th-80th percentile range, then an iterative regression discards
    points whose residual exceeds 3 sigma between passes.

    Parameters
    ----------
    priceY : price series of y (estimate beta of this price)
    priceX : price series of x (usually the market)
    algo : one of 'standard', 'log', 'returns'

    Returns
    -------
    beta : slope of Y versus X under the chosen algorithm

    Raises
    ------
    TypeError : if ``algo`` is not one of the three supported names.
    """
    X = DataFrame({'x': priceX, 'y': priceY})

    if algo == 'returns':
        ret = (X / X.shift(1) - 1).dropna().values
        x, y = ret[:, 0], ret[:, 1]

        # Keep only the middle 60% of x-returns before fitting.
        low, high = np.percentile(x, 20), np.percentile(x, 80)
        keep = (x > low) & (x < high)
        x, y = x[keep], y[keep]

        # Refit after discarding >3-sigma residuals; at most 9 passes or
        # until a pass finds no outliers.
        iteration, nrOutliers = 1, 1
        while iteration < 10 and nrOutliers > 0:
            a, b = polyfit(x, y, 1)
            err = polyval([a, b], x) - y
            idxOutlier = abs(err) > 3 * np.std(err)
            nrOutliers = sum(idxOutlier)
            beta = a
            x, y = x[~idxOutlier], y[~idxOutlier]
            iteration += 1

    elif algo == 'log':
        # Slope of the log-log regression is the beta estimate.
        a, _ = polyfit(np.log(X['x']), np.log(X['y']), 1)
        beta = a

    elif algo == 'standard':
        # Covariance of log returns over the variance of the market's.
        ret = np.log(X).diff().dropna()
        beta = ret['x'].cov(ret['y']) / ret['x'].var()

    else:
        raise TypeError("unknown algorithm type, use 'standard', 'log' or 'returns'")

    return beta
Example #12
0
    def test_shift_empty(self):
        # Regression test for #8019
        df = DataFrame({'foo': []})
        rs = df.shift(-1)

        assert_frame_equal(df, rs)
Example #13
0
    def test_shift(self):
        """Exercise DataFrame.shift end to end: integer periods (positive,
        negative, zero), DateOffset/freq-based shifts, PeriodIndex frames,
        and shifting along the column axis (GH 6371).

        NOTE(review): relies on legacy pandas test scaffolding --
        self.tsframe/self.intframe fixtures, self.assert_index_equal,
        self.assertEqual, assertRaisesRegexp, tm.makePeriodFrame and
        Series.valid() -- all pre-1.0 APIs; confirm the pandas version
        before running.
        """
        # naive shift
        shiftedFrame = self.tsframe.shift(5)
        self.assert_index_equal(shiftedFrame.index, self.tsframe.index)

        shiftedSeries = self.tsframe['A'].shift(5)
        assert_series_equal(shiftedFrame['A'], shiftedSeries)

        shiftedFrame = self.tsframe.shift(-5)
        self.assert_index_equal(shiftedFrame.index, self.tsframe.index)

        shiftedSeries = self.tsframe['A'].shift(-5)
        assert_series_equal(shiftedFrame['A'], shiftedSeries)

        # shift by 0 must be a no-op
        unshifted = self.tsframe.shift(0)
        assert_frame_equal(unshifted, self.tsframe)

        # shift by DateOffset: index moves, length stays the same
        shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay())
        self.assertEqual(len(shiftedFrame), len(self.tsframe))

        shiftedFrame2 = self.tsframe.shift(5, freq='B')
        assert_frame_equal(shiftedFrame, shiftedFrame2)

        d = self.tsframe.index[0]
        shifted_d = d + offsets.BDay(5)
        assert_series_equal(self.tsframe.xs(d),
                            shiftedFrame.xs(shifted_d), check_names=False)

        # shift int frame (smoke test only -- result unused)
        int_shifted = self.intframe.shift(1)  # noqa

        # Shifting with PeriodIndex
        ps = tm.makePeriodFrame()
        shifted = ps.shift(1)
        unshifted = shifted.shift(-1)
        self.assert_index_equal(shifted.index, ps.index)
        self.assert_index_equal(unshifted.index, ps.index)
        tm.assert_numpy_array_equal(unshifted.iloc[:, 0].valid().values,
                                    ps.iloc[:-1, 0].values)

        shifted2 = ps.shift(1, 'B')
        shifted3 = ps.shift(1, offsets.BDay())
        assert_frame_equal(shifted2, shifted3)
        assert_frame_equal(ps, shifted2.shift(-1, 'B'))

        # a freq that disagrees with the PeriodIndex freq must raise
        assertRaisesRegexp(ValueError, 'does not match PeriodIndex freq',
                           ps.shift, freq='D')

        # shift other axis
        # GH 6371
        df = DataFrame(np.random.rand(10, 5))
        expected = pd.concat([DataFrame(np.nan, index=df.index,
                                        columns=[0]),
                              df.iloc[:, 0:-1]],
                             ignore_index=True, axis=1)
        result = df.shift(1, axis=1)
        assert_frame_equal(result, expected)

        # shift named axis
        df = DataFrame(np.random.rand(10, 5))
        expected = pd.concat([DataFrame(np.nan, index=df.index,
                                        columns=[0]),
                              df.iloc[:, 0:-1]],
                             ignore_index=True, axis=1)
        result = df.shift(1, axis='columns')
        assert_frame_equal(result, expected)
def clean_pw_offday(pw_offday, weeklookup, pw_slp2):
    '''
    Clean pw_offday query without filtering out non-off-days,
    rolling up invoice-level => day level => customer level.

    Parameters (NOTE(review): inferred from usage -- confirm against caller)
    ----------
    pw_offday : invoice-level delivery rows from the AS400 query (columns
        renamed below from #M* / C* codes)
    weeklookup : frame with a 'Date' column (and presumably a 'Week' column
        used later in the groupbys -- verify)
    pw_slp2 : salesperson lookup (id, name, director)

    Returns
    -------
    (high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries)

    NOTE(review): uses several legacy pandas APIs -- dict-of-dicts .agg()
    renaming (removed in pandas 1.0) and .reindex_axis (removed; use
    .reindex(..., axis=1)) -- so this requires an old pandas. Also calls
    len_unique, which is defined elsewhere in this module.
    '''
    print('*'*100)
    print('Cleaning pw_offday query and creating summaries.')
    print('*'*100)
    deliveries = pw_offday

    print('\n\n\nDeclaring functions for later use.')
    def as400_date(dat):
        '''Convert an AS400-format date (last 6 digits are YYMMDD) to a date.'''
        # NOTE(review): `dt` appears to be datetime.datetime imported as dt.
        dat = str(dat)
        dat = dat[-6:]
        dat = dt.date(dt.strptime(dat, '%y%m%d'))
        return dat

    def sum_digits_in_string(digit):
        '''Sum the numeric digits in a string (counts 1-bits of a day bitmask).'''
        return sum(int(x) for x in digit if x.isdigit())

    print('Mapping Columns.')
    deliveries.rename(columns={'#MIVDT':'Date', '#MDIV#':'Division', '#MIVND':'Invoice', 
                       '#MCUS#':'CustomerId', '#MCALL':'Call', '#MPRIO':'Priority', 
                       '#MCMP':'Warehouse', 'CASES':'Cases', '#MEXT$':'Dollars', 
                       'CSHP':'Ship', '#MSLSP':'SalespersonId', 
                       'CADMBR':'ShipWeekPlan', 'CUDSCC':'Merchandising', 'CONPRM':'OnPremise', 
                       'CSTDTE':'CustomerSetup', '#MCUSY':'CustomerType', 'CCUSTN':'Customer'}, inplace=True)
    pw_slp2.rename(columns={'S2NUM#':'SalespersonId', 'S2NAME':'Salesperson', 'S2DIVR':'SalespersonDirector'}, 
                   inplace=True)

    deliveries = deliveries.merge(pw_slp2, on='SalespersonId', how='left')

    print('Mapping Customer types.')
    typ_map = {'A':'Bar/Tavern','C':'Country Club','E':'Transportation/Airline','G':'Gambling',\
                'J':'Hotel/Motel','L':'Restaurant','M':'Military','N':'Fine Dining','O':'Internal',\
                'P':'Country/Western','S':'Package Store','T':'Supermarket/Grocery','V':'Drug Store',\
                'Y':'Convenience Store','Z':'Catering','3':'Night Club','5':'Adult Entertainment','6':'Sports Bar',\
                'I':'Church','F':'Membership Club','B':'Mass Merchandiser','H':'Fraternal Organization',\
                '7':'Sports Venue'}
    deliveries.CustomerType = deliveries.CustomerType.astype(str).map(typ_map)

    print('Mapping Warehouse names.')
    whs_map = {1:'Kansas City',2:'Saint Louis',3:'Columbia',4:'Cape Girardeau', 5:'Springfield'}
    deliveries.Warehouse = deliveries.Warehouse.map(whs_map)

    print('Processing dates.')
    deliveries.Date = [as400_date(d) for d in deliveries.Date.astype(str).tolist()]
    weeklookup['Date'] = [dt.date(dt.strptime(w_Dat, '%m/%d/%Y')) for w_Dat in weeklookup['Date'].astype(str).tolist()]

    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')

    dat = Series(deliveries.Date.tolist())
    deliveries['Weekday'] = Series([dt.strftime(d, '%A') for d in dat])

    week_plan = deliveries.ShipWeekPlan.tolist()
    week_shipped = deliveries.ShipWeek.tolist()

    print('Using custom logic to derive which days were off-day deliveries.')
    # Ship is normalized to a 7-character bitmask of scheduled delivery days,
    # Mon..Sun. NOTE(review): zfill(0) is a no-op -- probably intended zfill(7).
    deliveries.Ship = del_days = [str('%07d'% int(str(day).zfill(0))) for day in deliveries.Ship.astype(str).tolist()]

    # One letter per weekday position; '_' where no delivery is scheduled.
    mon = Series([d[-7:][:1] for d in del_days]).map({'1':'M','0':'_'})
    tue = Series([d[-6:][:1] for d in del_days]).map({'1':'T','0':'_'})
    wed = Series([d[-5:][:1] for d in del_days]).map({'1':'W','0':'_'})
    thu = Series([d[-4:][:1] for d in del_days]).map({'1':'R','0':'_'})
    fri = Series([d[-3:][:1] for d in del_days]).map({'1':'F','0':'_'})
    sat = Series([d[-2:][:1] for d in del_days]).map({'1':'S','0':'_'})
    sun = Series([d[-1:][:1] for d in del_days]).map({'1':'U','0':'_'})

    deliveries['DeliveryDays'] = del_days = list(itertools.chain.from_iterable([mon + tue + wed + thu + fri + sat + sun]))

    weekday = deliveries.Weekday = [d[:3] for d in deliveries.Weekday.astype(str).tolist()]
    _days = DataFrame(data={'Weekday':weekday, 'WeekPlanned':week_plan, 'WeekShipped':week_shipped, 'DelDays':del_days}) #'Monday':mon, 'Tuesday':tue, 'Wednesday':wed, 'Thursday':thu, 'Friday':fri, 'Saturday':sat, 'Sunday':sun,
    day_list = _days['WeekPlanned'].tolist()
    _days['WeekPlanned'] = [d if d in ['A','B'] else '' for d in day_list]

    # Where no A/B plan exists, fall back to the week actually shipped;
    # an off-week is any mismatch between planned and shipped week.
    _week_actual = _days.WeekShipped.tolist()
    _week_plan = _days['WeekPlanned'] = [ship_week if plan_week == '' else plan_week for ship_week, plan_week in zip(_week_actual,_days.WeekPlanned.tolist())]
    _days['OffWeek'] = _off_week = [p != a for p, a in zip(_week_plan, _week_actual)]

    # Off-day flags ('T'/'F' strings): shipped on a weekday whose letter is
    # absent from the customer's DeliveryDays mask.
    off_mon = [str('M' not in d and w == 'Mon')[:1] for d, w in zip(del_days, weekday)]
    off_tue = [str('T' not in d and w == 'Tue')[:1] for d, w in zip(del_days, weekday)]
    off_wed = [str('W' not in d and w == 'Wed')[:1] for d, w in zip(del_days, weekday)]
    off_thu = [str('R' not in d and w == 'Thu')[:1] for d, w in zip(del_days, weekday)]
    off_fri = [str('F' not in d and w == 'Fri')[:1] for d, w in zip(del_days, weekday)]
    off_sat = [str('S' not in d and w == 'Sat')[:1] for d, w in zip(del_days, weekday)]
    off_sun = [str('U' not in d and w == 'Sun')[:1] for d, w in zip(del_days, weekday)]

    _off_days = DataFrame({'Mon':off_mon, 'Tue':off_tue, 'Wed':off_wed, 'Thu':off_thu, 
                           'Fri':off_fri, 'Sat':off_sat, 'Sun':off_sun, 'OffWeek':_off_week, 'Weekday':weekday})
    _off_days = _off_days[['Mon','Tue','Wed','Thu','Fri','Sat','Sun','Weekday','OffWeek']]
    _off_days['OffDayDelivery'] = (_off_days['Mon'] == 'T') | (_off_days['Tue'] == 'T') | (_off_days['Wed'] == 'T') | (_off_days['Thu'] == 'T') | (_off_days['Fri'] == 'T') | (_off_days['Sat'] == 'T') | (_off_days['Sun'] == 'T') | (_off_days['OffWeek'] == True)                

    print('Check here if you suspect a bug.')
    #check_later = _off_days[_off_days['OffDayDelivery'] == True]

    print('Mapping Call Codes.')
    deliveries = pd.concat([deliveries,_off_days[['OffWeek','OffDayDelivery']]], axis=1)
    deliveries.Call = deliveries.Call.map({1:'Customer Call', 2:'ROE/EDI', 3:'Salesperson Call', 4:'Telesales'})

    print('Putting Setup Date into proper date format.')
    setup_date = deliveries.CustomerSetup.astype(str).tolist()
    setup_month = Series([d.zfill(4)[:2] for d in setup_date])
    # Two-digit year pivot: 00-19 => 20xx, otherwise 19xx.
    setup_year = Series(["20" + s[-2:] if int(s[-2:]) < 20 else "19" + s[-2:] for s in setup_date]) #this_century = [int(d[-2:]) < 20 for d in setup_date]

    deliveries['CustomerSetup'] = c_setup = [str(mon) + '-' + str(yr) for mon, yr in zip(setup_month, setup_year)]

    print('Defining new customers based on whether they were setup last month or not.')
    # Build the 'MM-YYYY' label for last month (rolling back the year in January).
    if dt.now().month == 1:
        last_month = '12'
    else:
        last_month = str(dt.now().month - 1).zfill(2)
    if dt.now().month == 1:
            this_year = str(dt.now().year - 1)
    else:
        this_year = str(dt.now().year)
    m_y_cutoff = last_month + '-' + this_year

    deliveries['NewCustomer'] = [1 if m_y_cutoff == setup else 0 for setup in c_setup]
    deliveries['OffDayDeliveries'] =  deliveries.OffDayDelivery.astype(int)

    print('Deriving number of weekly deliveries allotted to each customer.')
    _n_days = deliveries.Ship.astype(str).tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [sum_digits_in_string(n) for n in _n_days]
    _allot = deliveries['AllottedWeeklyDeliveryDays'].tolist()
    _week_ind = deliveries['ShipWeekPlan'].tolist()
    # A/B alternating-week customers count as half a delivery day per week.
    deliveries['AllottedWeeklyDeliveryDays'] = [a if w not in ['A','B'] else 0.5 for a, w in zip(_allot, _week_ind)]
    _n_days = deliveries.set_index('CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()

    print('\n')
    print('-'*100)
    print('\n')

    print('Aggregating by Day.')
    # NOTE(review): dict-of-dicts agg renaming was removed in pandas 1.0;
    # len_unique is defined elsewhere in this module.
    agg_funcs_day = {'OffDayDeliveries' : {'Count':max}, 
                 'Date' : {'Count':len_unique},
                 'Cases' : {'Sum':sum, 'Avg':np.mean},
                 'Dollars' : {'Sum':sum, 'Avg':np.mean},
                 'NewCustomer': lambda x: min(x)}

    pass_through_cols = ['CustomerId','Customer','Week','Date']
    _agg_byday = DataFrame(deliveries.groupby(pass_through_cols).agg(agg_funcs_day)).reset_index(drop=False)
    _agg_byday = DataFrame(_agg_byday[['CustomerId','Customer','Week','Date','OffDayDeliveries','NewCustomer','Cases','Dollars']])
    _agg_byday.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byday.columns]
    _agg_byday.columns = ['CustomerId','Customer','Week','Date','Delivery','OffDayDelivery','NewCustomer','Cases|Sum','Cases|Avg','Dollars|Sum','Dollars|Avg']
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['CustomerId'].astype(int)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['AllottedWeeklyDeliveryDays|Count'].map(_n_days)



    print('Aggregating by Week.')
    agg_funcs_week = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x)}

    _agg_byweek = DataFrame(_agg_byday.groupby(['CustomerId','Week']).agg(agg_funcs_week)).reset_index(drop=False)
    _agg_byweek.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byweek.columns]

    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries 
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery. Use a dictionary {(cust#, week) : n_deliveries_total}
    _c = _agg_byweek['CustomerId'].astype(str).tolist()
    _w = _agg_byweek['Week'].astype(str).tolist()
    _agg_byweek['_X'] = [c + ',' + w for c,w in zip(_c,_w)]
    by_week_map = _agg_byweek.set_index('_X')['Delivery|Count'].to_dict()

    cid = _agg_byday['CustomerId'].astype(str).tolist()
    wkk = _agg_byday['Week'].astype(str).tolist()
    _agg_byday['N_DeliveriesThisWeek'] = [c + ',' + w for c, w in zip(cid, wkk)]
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday['N_DeliveriesThisWeek'].map(Series(by_week_map))


    print('Using custom logic to define Additional Delivery Days.')
    # An "additional" day: same customer/week as the previous row, flagged as
    # an off-day, not a new customer, and over the weekly allotment.
    addl_day_criteria_1 = ( _agg_byday.shift(1)['CustomerId'] == _agg_byday['CustomerId'] )
    addl_day_criteria_2 = ( _agg_byday.shift(1)['Week'] == _agg_byday['Week'] )
    addl_day_criteria_3 = ( _agg_byday['OffDayDelivery'] == 1 )
    addl_day_criteria_4 = ( _agg_byday['NewCustomer'] != 1 )
    addl_day_criteria_5 = ( _agg_byday['N_DeliveriesThisWeek'] > _agg_byday['AllottedWeeklyDeliveryDays|Count'] )

    _agg_byday['AdditionalDeliveryDays'] = Series(addl_day_criteria_1 & addl_day_criteria_2 & addl_day_criteria_3 & addl_day_criteria_4 & addl_day_criteria_5).astype(int)


    print('Aggregating by Customer.')
    agg_funcs_cust = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x),
                      'AllottedWeeklyDeliveryDays|Count': lambda x: max(x),
                      'AdditionalDeliveryDays': lambda x: sum(x),
                      'Dollars|Sum':lambda x: int(sum(x)),
                      'Cases|Sum':lambda x: sum(x) }                                           

    _agg_bycust = DataFrame(_agg_byday.groupby(['CustomerId','Customer']).agg(agg_funcs_cust)).reset_index(drop=False)
    _agg_bycust.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_bycust.columns]
    # NOTE(review): reindex_axis was removed in pandas 0.21+/1.0.
    _agg_bycust = _agg_bycust.reindex_axis(sorted(_agg_bycust.columns), axis=1)

    _agg_bycust.columns = ['AdditionalDeliveries','AllottedDeliveryDays','Cases',
                           'Customer','CustomerId','Deliveries','Dollars',
                           'NewCustomer','OffDayDeliveries']
    _agg_bycust = _agg_bycust[['CustomerId','Customer','NewCustomer','AllottedDeliveryDays','Deliveries',
                               'OffDayDeliveries','AdditionalDeliveries','Cases','Dollars']]


    print('Mapping useful Customer attributes.')
    attr = ['CustomerId','Warehouse','OnPremise','CustomerSetup','CustomerType','ShipWeekPlan','DeliveryDays']
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(drop=True)

    _agg_bycust = _agg_bycust.merge(customer_attributes, on='CustomerId', how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(by=['AdditionalDeliveries','OffDayDeliveries'], ascending=False).reset_index(drop=True)

    _agg_bycust['CasesPerDelivery'] = _agg_bycust['Cases'] / _agg_bycust['Deliveries']
    _agg_bycust['DollarsPerDelivery'] = round(_agg_bycust['Dollars'] / _agg_bycust['Deliveries'],2)

    _agg_bycust['OffDayDeliveries/Deliveries'] = round(_agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'],2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(_agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'],2)


    print('Mapping Tiers based on allotted delivery days.')
    tier_map = {0:'No Delivery Days Assigned',0.5:'Tier 4', 1:'Tier 3', 2:'Tier 2', 3:'Tier 1', 4:'Tier 1', 5:'Tier 1', 6:'Tier 1', 7:'Tier 1'}
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)

    addl_deliv = _agg_bycust['AdditionalDeliveries'].tolist()
    tier = _agg_bycust['Tier'].tolist()

    # Customers with no assigned delivery days cannot have "additional" days.
    _agg_bycust['AdditionalDeliveries'] = [addl if t != 'No Delivery Days Assigned' else 0 for addl, t in zip(addl_deliv, tier)]

    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(np.nan, '')


    print('Creating Overall Summary.')
    agg_funcs_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           

    overall_summary = DataFrame(_agg_bycust.groupby(['Tier','Warehouse']).agg(agg_funcs_summary))
    overall_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in overall_summary.columns]
    overall_summary = overall_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    overall_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']

    print('Creating High-Level Summary.\n\n\n')
    agg_funcs_HL_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           

    high_level_summary = DataFrame(_agg_bycust.groupby(['Tier']).agg(agg_funcs_HL_summary))
    high_level_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in high_level_summary.columns]
    high_level_summary = high_level_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    high_level_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']

    print('*'*100)
    print('Finished creating summaries at high level, overall, and aggregating by customer and by day.')
    print('*'*100)

    return high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries
Example #15
0
    quotes = fin.quotes_historical_yahoo(symbol, start, end)
    dates, open, close, high, low, volume = zip(*quotes)

    data = {"open": open, "close": close, "high": high, "low": low, "volume": volume}

    dates = Index([datetime.fromordinal(int(d)) for d in dates])
    return DataFrame(data, index=dates)


# Pull daily history for four large-cap tickers.
# NOTE(review): getQuotes, startDate, endDate are defined earlier in the file.
msft = getQuotes("MSFT", startDate, endDate)
aapl = getQuotes("AAPL", startDate, endDate)
goog = getQuotes("GOOG", startDate, endDate)
ibm = getQuotes("IBM", startDate, endDate)

# Align close prices into one frame; one-day simple returns via shift(1).
px = DataFrame({"MSFT": msft["close"], "IBM": ibm["close"], "GOOG": goog["close"], "AAPL": aapl["close"]})
returns = px / px.shift(1) - 1
# Select dates

# Days where IBM closed strictly between 95 and 100; align MSFT to those days.
subIndex = ibm.index[(ibm["close"] > 95) & (ibm["close"] < 100)]
msftOnSameDates = msft.reindex(subIndex)

# Insert columns

# Daily high-low trading range per ticker.
msft["hi-lo spread"] = msft["high"] - msft["low"]
ibm["hi-lo spread"] = ibm["high"] - ibm["low"]

# Aggregate monthly


def toMonthly(frame, how):
    offset = BMonthEnd()
from pandas import DataFrame
from pandas import concat
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error

# Read the first 2000 rows of the dataset, parsing column 0 as a date index.
# NOTE(review): read_csv is not in the visible import block -- presumably
# imported from pandas elsewhere; squeeze=True was removed in pandas 2.0
# (use .squeeze("columns") on the result) -- confirm the pandas version.
series = read_csv('../../static/data_set.csv',
                  nrows=2000,
                  header=0,
                  parse_dates=[0],
                  index_col=0,
                  squeeze=True)

# Create lagged dataset: 't-1' is the series shifted one step, 't+1' the original.
values = DataFrame(series.values)
dataframe = concat([values.shift(1), values], axis=1)
dataframe.columns = ['t-1', 't+1']
print(dataframe.head(5))

# Split into train/test sets: first 66% train; row 0 dropped (its t-1 is NaN).
X = dataframe.values
train_size = int(len(X) * 0.66)
train, test = X[1:train_size], X[train_size:]
train_X, train_y = train[:, 0], train[:, 1]
test_X, test_y = test[:, 0], test[:, 1]


# persistence model
def model_persistence(x):
    """Persistence (naive) forecast: predict the next value to equal *x*."""
    return x
	volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})

	price.to_csv(filename)

else:
	price = pd.read_csv(filename)
	price.index = [datetime.strptime(x,'%Y-%m-%d') for x in price['Date']]
	price = price.drop('Date',1)


# Specify number of days to shift
shift = 20
# Specify filter "length" (EW span reuses the shift horizon)
filter_len = shift

# 20-day lookback returns, then exponentially-weighted mean/variance.
# NOTE(review): pd.ewma/pd.ewmvar/pd.ewmcov were removed in pandas 0.23+
# (use .ewm(span=...).mean()/.var()/.cov()) and .ix was removed in 1.0
# (use .loc) -- this block requires a legacy pandas.
shift_returns = price/price.shift(shift) - 1
shift_returns_mean = pd.ewma(shift_returns,span=filter_len)
shift_returns_var = pd.ewmvar(shift_returns,span=filter_len)

# Pairwise EW covariance for every unordered stock pair, one column per pair.
CovSeq = pd.DataFrame()
for FirstStock in np.arange(NumStocks-1):
	for SecondStock in np.arange(FirstStock+1,NumStocks):
		ColumnTitle = StockList[FirstStock] + '-' + StockList[SecondStock]
		CovSeq[ColumnTitle] = pd.ewmcov(shift_returns[StockList[FirstStock]],shift_returns[StockList[SecondStock]],span=filter_len)

# Test CVXOPT code for a single day
date = '2013-10-31'
n = NumStocks+1
# NOTE(review): matrix/interest_rate presumably come from cvxopt and earlier
# in the file -- not visible here.
pbar = matrix(interest_rate,(1,n))
p2 = shift_returns_mean.ix[date]
p2 = matrix(p2)