Example #1
0
 def _prepare_observations(cls, feature_extractor, text_classes, assessments):
     texts, classes = zip(*text_classes)
     
     features_lists = {fn:[] for fn in feature_extractor.FEATURES}
     
     for text in texts:
         
         features = feature_extractor.extract(text)
         assert features_lists.keys() == features.keys()
         
         for feature_name, value in features.items():
             features_lists[feature_name].append(value)
         
     
     features_pd = DataFrame(features_lists)
     # ensure proper column order
     features_pd.reindex_axis(feature_extractor.FEATURES, axis=1)
     
     # Convert classes to a categorical
     ass_map = {c:i for i, c in enumerate(assessments)} # Int map
     ass_int = [ass_map[c] for c in classes] # Convert to int
     classes_pd = DataFrame(
             {'class': Categorical(ass_int, levels=assessments)},
             index = list(range(1, len(ass_int)+1))) #
     
     return features_pd, classes_pd
Example #2
0
def parse_splits(fname_or_df, outname=None):
    """From csv with columns name, break, yield a df with start, end and length.

    Parameters:
        fname_or_df:    filename. If df is given, recalculate length.

    Returns: df with name, start, end, length. Units are in milliseconds.
    """
    if type(fname_or_df) == str: log = DataFrame.from_csv(fname_or_df, index_col=None)
    else: log = fname_or_df
    
    if 'break' in log:
        df = DataFrame({'name':  log['name'][0:-1].values,
                        'start': log['break'][0:-1].values,
                        'end':   log['break'][1:].values,
                        'order': range(len(log) - 1)})
    elif 'start' in log: df = log
    else: raise Exception('needs to either have break, or start and end columns')

    # if splits are given in time, convert to millisecond
    if ':' in str(df['start'][0]): 
        for col in ['start', 'end']: df[col] = df[col].apply(time2ms)

    # calculate length
    df['length'] = df['end'] - df['start']
    assert all(col in df for col in OUTCOLS)

    if outname:
        df.to_csv(outname, index=None)
    
    return df.reindex_axis(OUTCOLS, axis=1)
Example #3
0
    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(lrange(15))
        assert np.isnan(result.values[-5:]).all()

        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=lrange(5), fill_value=0.)
        expected = df.copy()
        expected[4] = 0.
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value='foo')
        expected = df.copy()
        expected[4] = 'foo'
        assert_frame_equal(result, expected)

        # reindex_axis
        with tm.assert_produces_warning(FutureWarning):
            result = df.reindex_axis(lrange(15), fill_value=0., axis=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            result = df.reindex_axis(lrange(5), fill_value=0., axis=1)
        expected = df.reindex(columns=lrange(5)).fillna(0)
        assert_frame_equal(result, expected)

        # other dtypes
        df['foo'] = 'foo'
        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)
    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(lrange(15))
        self.assertTrue(np.isnan(result.values[-5:]).all())

        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=lrange(5), fill_value=0.0)
        expected = df.copy()
        expected[4] = 0.0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value="foo")
        expected = df.copy()
        expected[4] = "foo"
        assert_frame_equal(result, expected)

        # reindex_axis
        result = df.reindex_axis(lrange(15), fill_value=0.0, axis=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        result = df.reindex_axis(lrange(5), fill_value=0.0, axis=1)
        expected = df.reindex(columns=lrange(5)).fillna(0)
        assert_frame_equal(result, expected)

        # other dtypes
        df["foo"] = "foo"
        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)
Example #5
0
    def __init__(self, column, baseline, adjustments=None):
        self.column = column
        self.baseline = baseline.values
        self.dates = baseline.index
        self.assets = baseline.columns

        if adjustments is None:
            adjustments = DataFrame(index=DatetimeIndex([]), columns=ADJUSTMENT_COLUMNS)
        else:
            # Ensure that columns are in the correct order.
            adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
            adjustments.sort(["apply_date", "sid"], inplace=True)

        self.adjustments = adjustments
        self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
        self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
        self.adjustment_sids = Int64Index(adjustments.sid)
Example #6
0
    def test_include_na(self):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=self.sparse)
        exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
                         'b': {0: 0.0, 1: 1.0, 2: 0.0}})
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
        exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0},
                            'a': {0: 1.0, 1: 0.0, 2: 0.0},
                            'b': {0: 0.0, 1: 1.0, 2: 0.0}})
        exp_na = exp_na.reindex_axis(['a', 'b', nan], 1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
        exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def show_predcition_matrix(prediction: pd.DataFrame) -> None:
    prediction = prediction.sort_index()
    prediction = prediction.reindex_axis(sorted(prediction.columns), axis=1)
    obs_pre = {0: {0: 0, 1: 2}, 1: {0: 3, 1: 1}}
    plt.figure()
    axis = plt.gca()
    cmap = colors.ListedColormap(['white', 'cornflowerblue', 'red', 'darkorange'])
    bounds = [-0.5, 0.5, 1.5, 2.5, 3.5]
    norm = colors.BoundaryNorm(bounds, cmap.N)
    heatmap = plt.pcolor(array(prediction), cmap=cmap, norm=norm, edgecolors='k', linewidth=2)
    for y in range(array(prediction.shape)[0]):
        for x in range(array(prediction.shape)[1]):
            if array(prediction)[y, x] == np.nan:
                continue
            if array(prediction)[y, x] >= 0:
                plt.text(x+0.5, y+0.5, array(prediction)[y, x], horizontalalignment='center', verticalalignment='center')
    plt.yticks(arange(0.5, len(prediction.index), 1), prediction.index)
    plt.xticks(arange(0.5, len(prediction.columns), 1), prediction.columns, rotation=70)
    plt.xlabel('Cohesin name', style='oblique')
    plt.ylabel('Dockerin name', style='oblique')
    axis.set_aspect('equal')
    plt.title('Cohesin dockerin cross binding')
    plt.suptitle('0: obs no pred no, 1: obs yes, pred yes\n2: obs no pred yes, 3: obs yes pred no')
    plt.show()
def clean_pw_offday(pw_offday, weeklookup, pw_slp2):
    '''
    Clean pw_offday query without filtering out non-off-days
    invoice-level => day level => customer level
    '''
    print('*'*100)
    print('Cleaning pw_offday query and creating summaries.')    
    print('*'*100)
    deliveries = pw_offday
    
    print('\n\n\nDeclaring functions for later use.')
    def as400_date(dat):
        '''Accepts date as formatted in AS400'''
        dat = str(dat)
        dat = dat[-6:]
        dat = dt.date(dt.strptime(dat, '%y%m%d'))
        return dat
        
    def sum_digits_in_string(digit):
        return sum(int(x) for x in digit if x.isdigit())
        
    print('Mapping Columns.')
    deliveries.rename(columns={'#MIVDT':'Date', '#MDIV#':'Division', '#MIVND':'Invoice', 
                       '#MCUS#':'CustomerId', '#MCALL':'Call', '#MPRIO':'Priority', 
                       '#MCMP':'Warehouse', 'CASES':'Cases', '#MEXT$':'Dollars', 
                       'CSHP':'Ship', '#MSLSP':'SalespersonId', 
                       'CADMBR':'ShipWeekPlan', 'CUDSCC':'Merchandising', 'CONPRM':'OnPremise', 
                       'CSTDTE':'CustomerSetup', '#MCUSY':'CustomerType', 'CCUSTN':'Customer'}, inplace=True)
    pw_slp2.rename(columns={'S2NUM#':'SalespersonId', 'S2NAME':'Salesperson', 'S2DIVR':'SalespersonDirector'}, 
                   inplace=True)
                   
    deliveries = deliveries.merge(pw_slp2, on='SalespersonId', how='left')
    
    print('Mapping Customer types.')
    typ_map = {'A':'Bar/Tavern','C':'Country Club','E':'Transportation/Airline','G':'Gambling',\
                'J':'Hotel/Motel','L':'Restaurant','M':'Military','N':'Fine Dining','O':'Internal',\
                'P':'Country/Western','S':'Package Store','T':'Supermarket/Grocery','V':'Drug Store',\
                'Y':'Convenience Store','Z':'Catering','3':'Night Club','5':'Adult Entertainment','6':'Sports Bar',\
                'I':'Church','F':'Membership Club','B':'Mass Merchandiser','H':'Fraternal Organization',\
                '7':'Sports Venue'}
    deliveries.CustomerType = deliveries.CustomerType.astype(str).map(typ_map)    
    
    print('Mapping Warehouse names.')
    whs_map = {1:'Kansas City',2:'Saint Louis',3:'Columbia',4:'Cape Girardeau', 5:'Springfield'}
    deliveries.Warehouse = deliveries.Warehouse.map(whs_map)          
    
    print('Processing dates.')
    deliveries.Date = [as400_date(d) for d in deliveries.Date.astype(str).tolist()]    
    weeklookup['Date'] = [dt.date(dt.strptime(w_Dat, '%m/%d/%Y')) for w_Dat in weeklookup['Date'].astype(str).tolist()]
    
    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')
    
    dat = Series(deliveries.Date.tolist())
    deliveries['Weekday'] = Series([dt.strftime(d, '%A') for d in dat])
    
    week_plan = deliveries.ShipWeekPlan.tolist()
    week_shipped = deliveries.ShipWeek.tolist()
    
    print('Using custom logic to derive which days were off-day deliveries.')
    deliveries.Ship = del_days = [str('%07d'% int(str(day).zfill(0))) for day in deliveries.Ship.astype(str).tolist()]
    
    mon = Series([d[-7:][:1] for d in del_days]).map({'1':'M','0':'_'})
    tue = Series([d[-6:][:1] for d in del_days]).map({'1':'T','0':'_'})
    wed = Series([d[-5:][:1] for d in del_days]).map({'1':'W','0':'_'})
    thu = Series([d[-4:][:1] for d in del_days]).map({'1':'R','0':'_'})
    fri = Series([d[-3:][:1] for d in del_days]).map({'1':'F','0':'_'})
    sat = Series([d[-2:][:1] for d in del_days]).map({'1':'S','0':'_'})
    sun = Series([d[-1:][:1] for d in del_days]).map({'1':'U','0':'_'})
    
    deliveries['DeliveryDays'] = del_days = list(itertools.chain.from_iterable([mon + tue + wed + thu + fri + sat + sun]))
    
    weekday = deliveries.Weekday = [d[:3] for d in deliveries.Weekday.astype(str).tolist()]
    _days = DataFrame(data={'Weekday':weekday, 'WeekPlanned':week_plan, 'WeekShipped':week_shipped, 'DelDays':del_days}) #'Monday':mon, 'Tuesday':tue, 'Wednesday':wed, 'Thursday':thu, 'Friday':fri, 'Saturday':sat, 'Sunday':sun,
    day_list = _days['WeekPlanned'].tolist()
    _days['WeekPlanned'] = [d if d in ['A','B'] else '' for d in day_list]
    
    _week_actual = _days.WeekShipped.tolist()
    _week_plan = _days['WeekPlanned'] = [ship_week if plan_week == '' else plan_week for ship_week, plan_week in zip(_week_actual,_days.WeekPlanned.tolist())]
    _days['OffWeek'] = _off_week = [p != a for p, a in zip(_week_plan, _week_actual)]
    
    off_mon = [str('M' not in d and w == 'Mon')[:1] for d, w in zip(del_days, weekday)]
    off_tue = [str('T' not in d and w == 'Tue')[:1] for d, w in zip(del_days, weekday)]
    off_wed = [str('W' not in d and w == 'Wed')[:1] for d, w in zip(del_days, weekday)]
    off_thu = [str('R' not in d and w == 'Thu')[:1] for d, w in zip(del_days, weekday)]
    off_fri = [str('F' not in d and w == 'Fri')[:1] for d, w in zip(del_days, weekday)]
    off_sat = [str('S' not in d and w == 'Sat')[:1] for d, w in zip(del_days, weekday)]
    off_sun = [str('U' not in d and w == 'Sun')[:1] for d, w in zip(del_days, weekday)]
    
    _off_days = DataFrame({'Mon':off_mon, 'Tue':off_tue, 'Wed':off_wed, 'Thu':off_thu, 
                           'Fri':off_fri, 'Sat':off_sat, 'Sun':off_sun, 'OffWeek':_off_week, 'Weekday':weekday})
    _off_days = _off_days[['Mon','Tue','Wed','Thu','Fri','Sat','Sun','Weekday','OffWeek']]                           
    _off_days['OffDayDelivery'] = (_off_days['Mon'] == 'T') | (_off_days['Tue'] == 'T') | (_off_days['Wed'] == 'T') | (_off_days['Thu'] == 'T') | (_off_days['Fri'] == 'T') | (_off_days['Sat'] == 'T') | (_off_days['Sun'] == 'T') | (_off_days['OffWeek'] == True)                
       
    print('Check here if you suspect a bug.')                    
    #check_later = _off_days[_off_days['OffDayDelivery'] == True]
    
    print('Mapping Call Codes.')
    deliveries = pd.concat([deliveries,_off_days[['OffWeek','OffDayDelivery']]], axis=1)
    deliveries.Call = deliveries.Call.map({1:'Customer Call', 2:'ROE/EDI', 3:'Salesperson Call', 4:'Telesales'})
    
    print('Putting Setup Date into proper date format.')
    setup_date = deliveries.CustomerSetup.astype(str).tolist()
    setup_month = Series([d.zfill(4)[:2] for d in setup_date])
    setup_year = Series(["20" + s[-2:] if int(s[-2:]) < 20 else "19" + s[-2:] for s in setup_date]) #this_century = [int(d[-2:]) < 20 for d in setup_date]
    
    deliveries['CustomerSetup'] = c_setup = [str(mon) + '-' + str(yr) for mon, yr in zip(setup_month, setup_year)]
    
    print('Defining new customers based on whether they were setup last month or not.')
    if dt.now().month == 1:
        last_month = '12'
    else:
        last_month = str(dt.now().month - 1).zfill(2)
    if dt.now().month == 1:
            this_year = str(dt.now().year - 1)
    else:
        this_year = str(dt.now().year)
    m_y_cutoff = last_month + '-' + this_year
        
    deliveries['NewCustomer'] = [1 if m_y_cutoff == setup else 0 for setup in c_setup]
    deliveries['OffDayDeliveries'] =  deliveries.OffDayDelivery.astype(int)
    
    print('Deriving number of weekly deliveries allotted to each customer.')
    _n_days = deliveries.Ship.astype(str).tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [sum_digits_in_string(n) for n in _n_days]
    _allot = deliveries['AllottedWeeklyDeliveryDays'].tolist()
    _week_ind = deliveries['ShipWeekPlan'].tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [a if w not in ['A','B'] else 0.5 for a, w in zip(_allot, _week_ind)]
    _n_days = deliveries.set_index('CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()
    
    print('\n')
    print('-'*100)    
    print('\n')    
    
    print('Aggregating by Day.')
    agg_funcs_day = {'OffDayDeliveries' : {'Count':max}, 
                 'Date' : {'Count':len_unique},
                 'Cases' : {'Sum':sum, 'Avg':np.mean},
                 'Dollars' : {'Sum':sum, 'Avg':np.mean},
                 'NewCustomer': lambda x: min(x)}
    
    pass_through_cols = ['CustomerId','Customer','Week','Date']
    _agg_byday = DataFrame(deliveries.groupby(pass_through_cols).agg(agg_funcs_day)).reset_index(drop=False)
    _agg_byday = DataFrame(_agg_byday[['CustomerId','Customer','Week','Date','OffDayDeliveries','NewCustomer','Cases','Dollars']])
    _agg_byday.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byday.columns]
    _agg_byday.columns = ['CustomerId','Customer','Week','Date','Delivery','OffDayDelivery','NewCustomer','Cases|Sum','Cases|Avg','Dollars|Sum','Dollars|Avg']
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['CustomerId'].astype(int)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['AllottedWeeklyDeliveryDays|Count'].map(_n_days)
    
    
    
    print('Aggregating by Week.')
    agg_funcs_week = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x)}
    
    _agg_byweek = DataFrame(_agg_byday.groupby(['CustomerId','Week']).agg(agg_funcs_week)).reset_index(drop=False)
    _agg_byweek.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byweek.columns]
    
    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries 
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery. Use a dictionary {(cust#, week) : n_deliveries_total}
    _c = _agg_byweek['CustomerId'].astype(str).tolist()
    _w = _agg_byweek['Week'].astype(str).tolist()
    _agg_byweek['_X'] = [c + ',' + w for c,w in zip(_c,_w)]
    by_week_map = _agg_byweek.set_index('_X')['Delivery|Count'].to_dict()
    
    cid = _agg_byday['CustomerId'].astype(str).tolist()
    wkk = _agg_byday['Week'].astype(str).tolist()
    _agg_byday['N_DeliveriesThisWeek'] = [c + ',' + w for c, w in zip(cid, wkk)]
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday['N_DeliveriesThisWeek'].map(Series(by_week_map))
    
    
    print('Using custom logic to define Additional Delivery Days.')
    addl_day_criteria_1 = ( _agg_byday.shift(1)['CustomerId'] == _agg_byday['CustomerId'] )
    addl_day_criteria_2 = ( _agg_byday.shift(1)['Week'] == _agg_byday['Week'] )
    addl_day_criteria_3 = ( _agg_byday['OffDayDelivery'] == 1 )
    addl_day_criteria_4 = ( _agg_byday['NewCustomer'] != 1 )
    addl_day_criteria_5 = ( _agg_byday['N_DeliveriesThisWeek'] > _agg_byday['AllottedWeeklyDeliveryDays|Count'] )
    
    _agg_byday['AdditionalDeliveryDays'] = Series(addl_day_criteria_1 & addl_day_criteria_2 & addl_day_criteria_3 & addl_day_criteria_4 & addl_day_criteria_5).astype(int)
    
    
    print('Aggregating by Customer.')    
    agg_funcs_cust = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x),
                      'AllottedWeeklyDeliveryDays|Count': lambda x: max(x),
                      'AdditionalDeliveryDays': lambda x: sum(x),
                      'Dollars|Sum':lambda x: int(sum(x)),
                      'Cases|Sum':lambda x: sum(x) }                                           
    
    _agg_bycust = DataFrame(_agg_byday.groupby(['CustomerId','Customer']).agg(agg_funcs_cust)).reset_index(drop=False)
    _agg_bycust.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_bycust.columns]
    _agg_bycust = _agg_bycust.reindex_axis(sorted(_agg_bycust.columns), axis=1)
    
    _agg_bycust.columns = ['AdditionalDeliveries','AllottedDeliveryDays','Cases',
                           'Customer','CustomerId','Deliveries','Dollars',
                           'NewCustomer','OffDayDeliveries']
    _agg_bycust = _agg_bycust[['CustomerId','Customer','NewCustomer','AllottedDeliveryDays','Deliveries',
                               'OffDayDeliveries','AdditionalDeliveries','Cases','Dollars']]
    
    
    print('Mapping useful Customer attributes.')
    attr = ['CustomerId','Warehouse','OnPremise','CustomerSetup','CustomerType','ShipWeekPlan','DeliveryDays']
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(drop=True)
    
    _agg_bycust = _agg_bycust.merge(customer_attributes, on='CustomerId', how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(by=['AdditionalDeliveries','OffDayDeliveries'], ascending=False).reset_index(drop=True)
    
    _agg_bycust['CasesPerDelivery'] = _agg_bycust['Cases'] / _agg_bycust['Deliveries']
    _agg_bycust['DollarsPerDelivery'] = round(_agg_bycust['Dollars'] / _agg_bycust['Deliveries'],2)
    
    _agg_bycust['OffDayDeliveries/Deliveries'] = round(_agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'],2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(_agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'],2)
    
    
    print('Mapping Tiers based on allotted delivery days.')
    tier_map = {0:'No Delivery Days Assigned',0.5:'Tier 4', 1:'Tier 3', 2:'Tier 2', 3:'Tier 1', 4:'Tier 1', 5:'Tier 1', 6:'Tier 1', 7:'Tier 1'}
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)
    
    addl_deliv = _agg_bycust['AdditionalDeliveries'].tolist()
    tier = _agg_bycust['Tier'].tolist()
    
    _agg_bycust['AdditionalDeliveries'] = [addl if t != 'No Delivery Days Assigned' else 0 for addl, t in zip(addl_deliv, tier)]
    
    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(np.nan, '')
    
    
    print('Creating Overall Summary.')
    agg_funcs_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           
    
    overall_summary = DataFrame(_agg_bycust.groupby(['Tier','Warehouse']).agg(agg_funcs_summary))
    overall_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in overall_summary.columns]
    overall_summary = overall_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    overall_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']
    
    print('Creating High-Level Summary.\n\n\n')
    agg_funcs_HL_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           
    
    high_level_summary = DataFrame(_agg_bycust.groupby(['Tier']).agg(agg_funcs_HL_summary))
    high_level_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in high_level_summary.columns]
    high_level_summary = high_level_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    high_level_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']
                                       
    print('*'*100)
    print('Finished creating summaries at high level, overall, and aggregating by customer and by day.')
    print('*'*100)    

    return high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries
Example #9
0
df.sort(columns=['one','two'], ascending=[0,1])

prices = [101.0,102.0,103.0]
tickers = ['GOOG','AAPL']
data = [v for v in itertools.product(tickers,prices)]
dates = pandas.date_range('2013-01-03',periods=3)
df = DataFrame(data, columns=['ticker','price'])
df['dates'] = dates.append(dates)
df
df.pivot(index='dates',columns='ticker',values='price')

original = DataFrame([[1,1],[2,2],[3.0,3]],index=['a','b','c'], columns=['one','two'])
original.reindex(index=['b','c','d'])
different = DataFrame([[1,1],[2,2],[3.0,3]],index=['c','d','e'], columns=['one','two'])
original.reindex_like(different)
original.reindex_axis(['two','one'], axis = 1)

left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
right = DataFrame([[1,2],[3,4],[7,8]],columns=['one','three'])
left.merge(right,on='one') # Same as how='inner'
left.merge(right,on='one', how='left')
left.merge(right,on='one', how='right')
left.merge(right,on='one', how='outer')

left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
left
right = DataFrame([[nan,12],[13,nan],[nan,8]],columns=['one','two'],index=[1,2,3])
right
left.update(right) # Updates values in left
left
df.sort(columns=["one", "two"], ascending=[0, 1])

prices = [101.0, 102.0, 103.0]
tickers = ["GOOG", "AAPL"]
data = [v for v in itertools.product(tickers, prices)]
dates = pandas.date_range("2013-01-03", periods=3)
df = DataFrame(data, columns=["ticker", "price"])
df["dates"] = dates.append(dates)
df
df.pivot(index="dates", columns="ticker", values="price")

original = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=["a", "b", "c"], columns=["one", "two"])
original.reindex(index=["b", "c", "d"])
different = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=["c", "d", "e"], columns=["one", "two"])
original.reindex_like(different)
original.reindex_axis(["two", "one"], axis=1)

left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=["one", "three"])
left.merge(right, on="one")  # Same as how='inner'
left.merge(right, on="one", how="left")
left.merge(right, on="one", how="right")
left.merge(right, on="one", how="outer")

left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
left
right = DataFrame([[nan, 12], [13, nan], [nan, 8]], columns=["one", "two"], index=[1, 2, 3])
right
left.update(right)  # Updates values in left
left
def aggregate_unsaleables_by_product(pwunsale_tidy, pwrct1_tidy, pw_ytdprod, pw_ytdsupp):
    '''
    Aggregates unsaleables returns & dumps by product.
    
    Takes tidy data as input (previous function).
    '''
    pwunsale = pwunsale_tidy
    pwrct1 = pwrct1_tidy    
    
    print('Expect to see the following. \n\n\n')
    tot_unsaleable = np.sum(pwrct1_tidy['ExtCost'])
    returned = np.sum(pwunsale_tidy['ExtCost'])
    
    print('Total unsaleables expected:  $%.2f' % tot_unsaleable) 
    print('Total returns expected:  $%.2f' % returned)
    
    print('\n\n\nAggregating RCT1 data by Product.')
    agg_funcs_product_rct = {'CasesUnsaleable': {'avg':np.mean, 'sum':np.sum},
                             'ExtCost': {'avg':np.mean, 'sum':np.sum}}
    
    grp_cols = ['SupplierId','Supplier','ProductId','Product']
    _agg_byproduct_rct = DataFrame(pwrct1.groupby(grp_cols).agg(agg_funcs_product_rct).reset_index(drop=False))
    _agg_byproduct_rct.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byproduct_rct.columns]                  
    _agg_byproduct_rct = _agg_byproduct_rct.reindex_axis(sorted(_agg_byproduct_rct.columns), axis=1)
    _agg_byproduct_rct.columns = ['CasesUnsaleable|avg', 'CasesUnsaleable|sum', 
                                  'DollarsUnsaleable|avg', 'DollarsUnsaleable|sum',
                                  'Product', 'ProductId', 
                                  'Supplier', 'SupplierId']
                                  
    print('\nUpdated unsaleables: $%.2f \n' % np.sum(_agg_byproduct_rct['DollarsUnsaleable|sum']))
    
    print('Aggregating MTC data by Product.')
    agg_funcs_product_mtc = {'CasesReturned': {'avg':np.mean, 'sum':np.sum},
                         'ExtCost': {'avg':np.mean, 'sum':np.sum}}
    
    _agg_byproduct_mtc = DataFrame(pwunsale.groupby(grp_cols).agg(agg_funcs_product_mtc).reset_index(drop=False))
    _agg_byproduct_mtc.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byproduct_mtc.columns]                  
    _agg_byproduct_mtc = _agg_byproduct_mtc.reindex_axis(sorted(_agg_byproduct_mtc.columns), axis=1)
    _agg_byproduct_mtc.columns = ['CasesReturned|avg', 'CasesReturned|sum', 
                                  'DollarsReturned|avg', 'DollarsReturned|sum',
                                  'Product', 'ProductId', 
                                  'Supplier', 'SupplierId']
                                  
    print('\nUpdated returns: $%.2f \n' % np.sum(_agg_byproduct_mtc['DollarsReturned|sum']))
    
    print('Combining RCT and MTC data.')
    _agg_byproduct_combined = _agg_byproduct_rct.merge(_agg_byproduct_mtc.drop(labels=['Supplier','Product'], axis=1), on=['SupplierId','ProductId'], how='outer')
    _agg_byproduct_combined[['ProductId','SupplierId']] = _agg_byproduct_combined[['ProductId','SupplierId']].astype(np.int)
    
    print('Merging in Directors on the SupplierId field.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(directors[['SupplierId','Director']], on='SupplierId',how='left')
    
    print('\nUpdated Unsaleables: $%.2f' % np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' % np.sum(_agg_byproduct_combined['DollarsReturned|sum']))
    
    print('Reordering columns.')
    reorder_cols = ['Director', 'SupplierId', 'Supplier', 
                    'ProductId', 'Product',
                    'DollarsUnsaleable|sum', 'CasesUnsaleable|sum',
                    'DollarsUnsaleable|avg', 'CasesUnsaleable|avg',
                    'DollarsReturned|sum', 'CasesReturned|sum',
                    'DollarsReturned|avg', 'CasesReturned|avg']
    _agg_byproduct_combined = _agg_byproduct_combined[reorder_cols]
    
    print('Mapping in attribute columns.')
    _attrs = ['ProductId', 'Size', 'Class', 'QPC']
    _attributes = pwrct1[_attrs].drop_duplicates(subset='ProductId')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(_attributes, on='ProductId', how='left')
    
    print('Mapping in YTD sales by Product.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(pw_ytdprod, on='ProductId', how='left')
    
    print('Mapping in YTD sales by Supplier.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(pw_ytdsupp, on='SupplierId', how='left')
    
    print('Deriving percenteage of sales by Product.')
    _agg_byproduct_combined['PercentSales|byproduct'] = np.divide(_agg_byproduct_combined['DollarsUnsaleable|sum'], _agg_byproduct_combined['DollarSales|byproduct'])
    
    print('Deriving percenteage of sales by Suppplier.')
    _agg_byproduct_combined['PercentSales|bysupplier'] = np.divide(_agg_byproduct_combined['DollarsUnsaleable|sum'], _agg_byproduct_combined['DollarSales|bysupplier'])    
        
    print('\nUpdated Unsaleables: $%.2f' % np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' % np.sum(_agg_byproduct_combined['DollarsReturned|sum']))
    
    print('Checking for and dropping Duplicates.')
    _agg_byproduct_combined.drop_duplicates(inplace=True)
    
    print('Replacing NaN values with zeros for readability.')
    _agg_byproduct_combined.fillna(0, inplace=True)
    
    print('Sorting in descending order on total unsaleables.\n\n\n')
    _agg_byproduct_combined.sort_values('DollarsUnsaleable|sum', ascending=False, inplace=True)

    print('Resetting index.')
    _agg_byproduct_combined.reset_index(inplace=True, drop=True)    
    
    print('Compare values below to originals. \n\n\n')
    new_tot_unsaleable = np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum'])
    new_returned = np.sum(_agg_byproduct_combined['DollarsReturned|sum'])
    
    print('Original Unsaleables:  $%.2f \nPost-Processing Unsaleables:  $%.2f \n' % (tot_unsaleable, new_tot_unsaleable)) 
    print('Original Returns:  $%.2f \nPost-Processing Returns:  $%.2f \n\n\n' % (returned, new_returned)) 
    
    print('*'*100)
    print('If the numbers above do not match then there is a bug in the program.')
    print('*'*100)
    
    return _agg_byproduct_combined