Example #1
def sum_df_sequence(seq: Iterable[DataFrame],
                    fill_value: Union[int, float] = 0) -> DataFrame:
    """Sums over a sequence of DataFrames, even if they have different indexes or columns, filling in 0 (or a value of
    your choice) for missing rows or columns. Useful when you have a sequence of DataFrames which are supposed to have
    the same indexes and columns but might be missing a few values.

    Args:
        seq (Iterable[pandas.DataFrame]): Any iterable of DataFrame type, ordered or unordered.
        fill_value (Union[int, float], optional): Defaults to ``0``. The value to use for missing cells.

    Returns:
        pandas.DataFrame: The sum over all items in seq.
    """
    common_index = Index([])
    common_columns = Index([])
    accumulator = DataFrame()

    for df in seq:
        if not df.index.equals(common_index):
            common_index |= df.index
            accumulator = accumulator.reindex_axis(common_index,
                                                   axis=0,
                                                   fill_value=fill_value)
            df = df.reindex_axis(common_index, axis=0, fill_value=fill_value)
        if not df.columns.equals(common_columns):
            common_columns |= df.columns
            accumulator = accumulator.reindex_axis(common_columns,
                                                   axis=1,
                                                   fill_value=fill_value)
            df = df.reindex_axis(common_columns, axis=1, fill_value=fill_value)
        accumulator += df
    return accumulator
Example #2
    def _prepare_observations(cls, feature_extractor, text_classes, assessments):
        texts, classes = zip(*text_classes)

        features_lists = {fn: [] for fn in feature_extractor.FEATURES}

        for text in texts:
            features = feature_extractor.extract(text)
            assert features_lists.keys() == features.keys()

            for feature_name, value in features.items():
                features_lists[feature_name].append(value)

        features_pd = DataFrame(features_lists)
        # ensure proper column order (reindex_axis returns a new frame,
        # so the result must be assigned back)
        features_pd = features_pd.reindex_axis(feature_extractor.FEATURES,
                                               axis=1)

        # Convert classes to a categorical; from_codes takes the integer
        # codes built below, with the assessments as the category set.
        ass_map = {c: i for i, c in enumerate(assessments)}  # label -> int code
        ass_int = [ass_map[c] for c in classes]  # convert labels to int codes
        classes_pd = DataFrame(
            {'class': Categorical.from_codes(ass_int, categories=assessments)},
            index=list(range(1, len(ass_int) + 1)))

        return features_pd, classes_pd
Example #3
def parse_splits(fname_or_df, outname=None):
    """From csv with columns name, break, yield a df with start, end and length.

    Parameters:
        fname_or_df:    filename. If df is given, recalculate length.

    Returns: df with name, start, end, length. Units are in milliseconds.
    """
    # DataFrame.from_csv is deprecated; pd.read_csv is its replacement
    if isinstance(fname_or_df, str):
        log = pd.read_csv(fname_or_df)
    else:
        log = fname_or_df

    if 'break' in log:
        df = DataFrame({'name':  log['name'][0:-1].values,
                        'start': log['break'][0:-1].values,
                        'end':   log['break'][1:].values,
                        'order': range(len(log) - 1)})
    elif 'start' in log:
        df = log
    else:
        raise ValueError('needs to either have break, or start and end columns')

    # if splits are given as clock time, convert to milliseconds
    if ':' in str(df['start'][0]):
        for col in ['start', 'end']:
            df[col] = df[col].apply(time2ms)

    # calculate length
    df['length'] = df['end'] - df['start']
    assert all(col in df for col in OUTCOLS)

    if outname:
        df.to_csv(outname, index=None)
    
    return df.reindex_axis(OUTCOLS, axis=1)
Example #4
def __reindex_cols_old_api(df: pd.DataFrame,
                           partial_ordering: Sequence[str]) -> pd.DataFrame:
    return df.reindex_axis(sorted(
        df.columns,
        key=lambda col_name: __element_order(col_name, partial_ordering)),
                           axis=1,
                           copy=False)
Example #5
    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(list(range(15)))
        assert np.isnan(result.values[-5:]).all()

        result = df.reindex(range(15), fill_value=0)
        expected = df.reindex(range(15)).fillna(0)
        assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=range(5), fill_value=0.)
        expected = df.copy()
        expected[4] = 0.
        assert_frame_equal(result, expected)

        result = df.reindex(columns=range(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=range(5), fill_value='foo')
        expected = df.copy()
        expected[4] = 'foo'
        assert_frame_equal(result, expected)

        # reindex_axis
        with tm.assert_produces_warning(FutureWarning):
            result = df.reindex_axis(range(15), fill_value=0., axis=0)
        expected = df.reindex(range(15)).fillna(0)
        assert_frame_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            result = df.reindex_axis(range(5), fill_value=0., axis=1)
        expected = df.reindex(columns=range(5)).fillna(0)
        assert_frame_equal(result, expected)

        # other dtypes
        df['foo'] = 'foo'
        result = df.reindex(range(15), fill_value=0)
        expected = df.reindex(range(15)).fillna(0)
        assert_frame_equal(result, expected)
Example #6
    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(lrange(15))
        assert np.isnan(result.values[-5:]).all()

        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=lrange(5), fill_value=0.)
        expected = df.copy()
        expected[4] = 0.
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value='foo')
        expected = df.copy()
        expected[4] = 'foo'
        assert_frame_equal(result, expected)

        # reindex_axis
        with tm.assert_produces_warning(FutureWarning):
            result = df.reindex_axis(lrange(15), fill_value=0., axis=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            result = df.reindex_axis(lrange(5), fill_value=0., axis=1)
        expected = df.reindex(columns=lrange(5)).fillna(0)
        assert_frame_equal(result, expected)

        # other dtypes
        df['foo'] = 'foo'
        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)
Example #7
    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(lrange(15))
        self.assertTrue(np.isnan(result.values[-5:]).all())

        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=lrange(5), fill_value=0.0)
        expected = df.copy()
        expected[4] = 0.0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value="foo")
        expected = df.copy()
        expected[4] = "foo"
        assert_frame_equal(result, expected)

        # reindex_axis
        result = df.reindex_axis(lrange(15), fill_value=0.0, axis=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        result = df.reindex_axis(lrange(5), fill_value=0.0, axis=1)
        expected = df.reindex(columns=lrange(5)).fillna(0)
        assert_frame_equal(result, expected)

        # other dtypes
        df["foo"] = "foo"
        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)
Example #8
    def test_reindex_fill_value(self):
        df = DataFrame(np.random.randn(10, 4))

        # axis=0
        result = df.reindex(lrange(15))
        self.assertTrue(np.isnan(result.values[-5:]).all())

        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        # axis=1
        result = df.reindex(columns=lrange(5), fill_value=0.)
        expected = df.copy()
        expected[4] = 0.
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value=0)
        expected = df.copy()
        expected[4] = 0
        assert_frame_equal(result, expected)

        result = df.reindex(columns=lrange(5), fill_value='foo')
        expected = df.copy()
        expected[4] = 'foo'
        assert_frame_equal(result, expected)

        # reindex_axis
        result = df.reindex_axis(lrange(15), fill_value=0., axis=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)

        result = df.reindex_axis(lrange(5), fill_value=0., axis=1)
        expected = df.reindex(columns=lrange(5)).fillna(0)
        assert_frame_equal(result, expected)

        # other dtypes
        df['foo'] = 'foo'
        result = df.reindex(lrange(15), fill_value=0)
        expected = df.reindex(lrange(15)).fillna(0)
        assert_frame_equal(result, expected)
Example #9
    def test_include_na(self):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=self.sparse)
        exp = DataFrame({
            'a': {
                0: 1,
                1: 0,
                2: 0
            },
            'b': {
                0: 0,
                1: 1,
                2: 0
            }
        },
                        dtype=np.uint8)
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see GH 8822
        res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
        exp_na = DataFrame(
            {
                nan: {
                    0: 0,
                    1: 0,
                    2: 1
                },
                'a': {
                    0: 1,
                    1: 0,
                    2: 0
                },
                'b': {
                    0: 0,
                    1: 1,
                    2: 0
                }
            },
            dtype=np.uint8)
        exp_na = exp_na.reindex_axis(['a', 'b', nan], 1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
        exp_just_na = DataFrame(Series(1, index=[0]),
                                columns=[nan],
                                dtype=np.uint8)
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Example #10
    def __init__(self, column, baseline, adjustments=None):
        self.column = column
        self.baseline = baseline.values
        self.dates = baseline.index
        self.assets = baseline.columns

        if adjustments is None:
            adjustments = DataFrame(index=DatetimeIndex([]), columns=ADJUSTMENT_COLUMNS)
        else:
            # Ensure that columns are in the correct order.
            adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
            adjustments.sort(["apply_date", "sid"], inplace=True)

        self.adjustments = adjustments
        self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
        self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
        self.adjustment_sids = Int64Index(adjustments.sid)
Example #11
def aggregate_indicators(df, index_cols, unstack_col, group_col, metric_col,
                         label):
    df = df.copy()
    agg = DataFrame()
    aggw = DataFrame(index=df['indicator'].unique(),
                     columns=df['grade'].unique())
    grp = df.set_index(index_cols + [unstack_col])[metric_col]\
        .unstack(unstack_col)\
        .groupby(level=group_col, group_keys=False)
    for grade, grp_df in grp:
        weights, grp_df = combine_subjects_by_rules(grp_df,
                                                    constituent=int(grade))
        grp_df = grp_df.to_frame(metric_col).reset_index()
        grp_df[unstack_col] = label
        agg = agg.append(grp_df)
        aggw[grade] = weights
    agg = agg.reindex_axis(df.columns, axis=1)

    return agg, aggw.fillna(0.0) / aggw.sum()
Example #12
    def __init__(self, column, baseline, adjustments=None):
        self.column = column
        self.baseline = baseline.values
        self.dates = baseline.index
        self.assets = baseline.columns

        if adjustments is None:
            adjustments = DataFrame(
                index=DatetimeIndex([]),
                columns=ADJUSTMENT_COLUMNS,
            )
        else:
            # Ensure that columns are in the correct order.
            adjustments = adjustments.reindex_axis(ADJUSTMENT_COLUMNS, axis=1)
            adjustments.sort(['apply_date', 'sid'], inplace=True)

        self.adjustments = adjustments
        self.adjustment_apply_dates = DatetimeIndex(adjustments.apply_date)
        self.adjustment_end_dates = DatetimeIndex(adjustments.end_date)
        self.adjustment_sids = Int64Index(adjustments.sid)
Example #13
    def test_include_na(self):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=self.sparse)
        exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
                         'b': {0: 0.0, 1: 1.0, 2: 0.0}})
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see GH 8822
        res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
        exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0},
                            'a': {0: 1.0, 1: 0.0, 2: 0.0},
                            'b': {0: 0.0, 1: 1.0, 2: 0.0}})
        exp_na = exp_na.reindex_axis(['a', 'b', nan], 1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
        exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Example #14
def get_post_hocs_as_df(df: pd.DataFrame,
                        dm: str,
                        factor: str,
                        tag: str = '') -> pd.DataFrame:
    table = sm.stats.multicomp.pairwise_tukeyhsd(df.dropna()[dm],
                                                 df.dropna()[factor])
    pvalues = pvalues_from_tukeyhsd(table)
    table_csv = table._results_table.as_csv()
    table_csv = table_csv[table_csv.find('\n') + 1:].replace(' ', '')
    df = pd.read_csv(StringIO(table_csv), delimiter=',')
    c = df.columns.tolist()
    df['tag'] = str(tag)
    df['p'] = ([float(p) for p in pvalues]
               if not isinstance(pvalues, float) else [float(pvalues)])
    df['dv'] = dm
    df['factor'] = factor
    df['sig'] = [p_desc2(p) for p in df['p']]
    df = df.reindex_axis(['tag', 'dv', 'factor'] + c + ['p', 'sig'], axis=1)
    df.drop(['reject'], axis=1, inplace=True)
    return df
Example #15
def show_prediction_matrix(prediction: pd.DataFrame) -> None:
    prediction = prediction.sort_index()
    prediction = prediction.reindex_axis(sorted(prediction.columns), axis=1)
    obs_pre = {0: {0: 0, 1: 2}, 1: {0: 3, 1: 1}}
    plt.figure()
    axis = plt.gca()
    cmap = colors.ListedColormap(['white', 'cornflowerblue', 'red', 'darkorange'])
    bounds = [-0.5, 0.5, 1.5, 2.5, 3.5]
    norm = colors.BoundaryNorm(bounds, cmap.N)
    heatmap = plt.pcolor(array(prediction), cmap=cmap, norm=norm, edgecolors='k', linewidth=2)
    for y in range(array(prediction.shape)[0]):
        for x in range(array(prediction.shape)[1]):
            if np.isnan(array(prediction)[y, x]):  # '== np.nan' is always False
                continue
            if array(prediction)[y, x] >= 0:
                plt.text(x+0.5, y+0.5, array(prediction)[y, x], horizontalalignment='center', verticalalignment='center')
    plt.yticks(arange(0.5, len(prediction.index), 1), prediction.index)
    plt.xticks(arange(0.5, len(prediction.columns), 1), prediction.columns, rotation=70)
    plt.xlabel('Cohesin name', style='oblique')
    plt.ylabel('Dockerin name', style='oblique')
    axis.set_aspect('equal')
    plt.title('Cohesin dockerin cross binding')
    plt.suptitle('0: obs no pred no, 1: obs yes, pred yes\n2: obs no pred yes, 3: obs yes pred no')
    plt.show()
Example #16
    def FindKeywords(self, support=10, ngrams=1):
        """Identify and rank keywords within target and non-target sets."""
        if isinstance(ngrams, int):
            ngrams = (1, ngrams)
        if isinstance(ngrams, list):
            ngrams = tuple(ngrams)

        frequent_words = GetDTM(self.search_set, min_df=support, ngram_range=ngrams).terms
        dtm = GetDTM(self.search_set, min_df=1, vocabulary=frequent_words)
        total_dc = DocCounts(frequent_words, dtm, None)
        target_dc = DocCounts(frequent_words, dtm, self.target_docnames)
        ntarget = len(self.target_docnames)
        nnontarget = len(self.nontarget_docnames)

        alpha1 = 1
        alpha0 = 1
        ranked_by = 'll'
        target_wordlist = []
        nontarget_wordlist = []
        target_stats = defaultdict(list)
        nontarget_stats = defaultdict(list)
        for word in frequent_words:
            n1 = target_dc[word]
            n0 = total_dc[word] - target_dc[word]
            p1 = (float(n1)/ntarget)*100
            p0 = (float(n0)/nnontarget)*100
            n1_not = ntarget - n1
            n0_not = nnontarget - n0
            ll = ((lgamma(n1 + alpha1) + lgamma(n0 + alpha0)
                   - lgamma(n1 + alpha1 + n0 + alpha0))
                  + (lgamma(n1_not + alpha1) + lgamma(n0_not + alpha0)
                     - lgamma(n1_not + alpha1 + n0_not + alpha0)))
            if hasattr(self, 'reference_keywords'):
                r_count = 0
                if word in self.reference_keywords:
                    r_count = self.reference_stats.loc[word, 'counts']
            else:
                r_count = None

            if p0 > p1:
                p1, p0 = p0, p1
                n1, n0 = n0, n1
                nontarget_wordlist.append(word)
                nontarget_stats['n1'].append(n1)
                nontarget_stats['n0'].append(n0)
                nontarget_stats['p1'].append(p1)
                nontarget_stats['p0'].append(p0)
                nontarget_stats['ll'].append(ll)
                nontarget_stats['T'].append(n0)
                nontarget_stats['S'].append(n0+n1)
                nontarget_stats['R'].append(r_count)
            else:
                target_wordlist.append(word)
                target_stats['n1'].append(n1)
                target_stats['n0'].append(n0)
                target_stats['p1'].append(p1)
                target_stats['p0'].append(p0)
                target_stats['ll'].append(ll)
                target_stats['T'].append(n1)
                target_stats['S'].append(n0+n1)
                target_stats['R'].append(r_count)

        target_stats = DataFrame(target_stats, index=target_wordlist)
        target_stats = target_stats.reindex_axis(['ll', 'n1', 'n0', 'p1', 'p0','T','S','R'], axis=1)
        target_stats.sort_values(ranked_by, ascending=False, inplace=True)
        nontarget_stats = DataFrame(nontarget_stats, index=nontarget_wordlist)
        nontarget_stats = nontarget_stats.reindex_axis(['ll', 'n1', 'n0', 'p1', 'p0','T','S','R'], axis=1)
        nontarget_stats.sort_values(ranked_by, ascending=False, inplace=True)

        if hasattr(self, 'reference_keywords'):
            ref_words = self.reference_keywords
            ref_dtm = GetDTM(self.search_set, min_df=1, vocabulary=ref_words)
            total_dc = DocCounts(ref_words, ref_dtm, None)
            target_dc = DocCounts(ref_words, ref_dtm, self.target_docnames)
            ref_T = []
            ref_S = []
            for word in ref_words:
                ref_T.append(target_dc[word])
                ref_S.append(total_dc[word])
            self.reference_stats['T'] = ref_T
            self.reference_stats['S'] = ref_S
            self.reference_stats['R'] = self.reference_stats['counts']

        self.target_stats = target_stats
        self.nontarget_stats = nontarget_stats
        self.target_keywords = list(target_stats.index)
        self.nontarget_keywords = list(nontarget_stats.index)
        print("{} target set keywords found".format(len(self.target_keywords)))
        print("{} non-target set keywords found".format(len(self.nontarget_keywords)))
Example #17
def clean_pw_offday(pw_offday, weeklookup, pw_slp2):
    '''
    Clean pw_offday query without filtering out non-off-days
    invoice-level => day level => customer level
    '''
    print('*'*100)
    print('Cleaning pw_offday query and creating summaries.')    
    print('*'*100)
    deliveries = pw_offday
    
    print('\n\n\nDeclaring functions for later use.')
    def as400_date(dat):
        '''Accepts date as formatted in AS400'''
        dat = str(dat)
        dat = dat[-6:]
        dat = dt.date(dt.strptime(dat, '%y%m%d'))
        return dat
        
    def sum_digits_in_string(digit):
        return sum(int(x) for x in digit if x.isdigit())
        
    print('Mapping Columns.')
    deliveries.rename(columns={'#MIVDT':'Date', '#MDIV#':'Division', '#MIVND':'Invoice', 
                       '#MCUS#':'CustomerId', '#MCALL':'Call', '#MPRIO':'Priority', 
                       '#MCMP':'Warehouse', 'CASES':'Cases', '#MEXT$':'Dollars', 
                       'CSHP':'Ship', '#MSLSP':'SalespersonId', 
                       'CADMBR':'ShipWeekPlan', 'CUDSCC':'Merchandising', 'CONPRM':'OnPremise', 
                       'CSTDTE':'CustomerSetup', '#MCUSY':'CustomerType', 'CCUSTN':'Customer'}, inplace=True)
    pw_slp2.rename(columns={'S2NUM#':'SalespersonId', 'S2NAME':'Salesperson', 'S2DIVR':'SalespersonDirector'}, 
                   inplace=True)
                   
    deliveries = deliveries.merge(pw_slp2, on='SalespersonId', how='left')
    
    print('Mapping Customer types.')
    typ_map = {'A':'Bar/Tavern','C':'Country Club','E':'Transportation/Airline','G':'Gambling',\
                'J':'Hotel/Motel','L':'Restaurant','M':'Military','N':'Fine Dining','O':'Internal',\
                'P':'Country/Western','S':'Package Store','T':'Supermarket/Grocery','V':'Drug Store',\
                'Y':'Convenience Store','Z':'Catering','3':'Night Club','5':'Adult Entertainment','6':'Sports Bar',\
                'I':'Church','F':'Membership Club','B':'Mass Merchandiser','H':'Fraternal Organization',\
                '7':'Sports Venue'}
    deliveries.CustomerType = deliveries.CustomerType.astype(str).map(typ_map)    
    
    print('Mapping Warehouse names.')
    whs_map = {1:'Kansas City',2:'Saint Louis',3:'Columbia',4:'Cape Girardeau', 5:'Springfield'}
    deliveries.Warehouse = deliveries.Warehouse.map(whs_map)          
    
    print('Processing dates.')
    deliveries.Date = [as400_date(d) for d in deliveries.Date.astype(str).tolist()]    
    weeklookup['Date'] = [dt.date(dt.strptime(w_Dat, '%m/%d/%Y')) for w_Dat in weeklookup['Date'].astype(str).tolist()]
    
    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')
    
    dat = Series(deliveries.Date.tolist())
    deliveries['Weekday'] = Series([dt.strftime(d, '%A') for d in dat])
    
    week_plan = deliveries.ShipWeekPlan.tolist()
    week_shipped = deliveries.ShipWeek.tolist()
    
    print('Using custom logic to derive which days were off-day deliveries.')
    deliveries.Ship = del_days = [str('%07d'% int(str(day).zfill(0))) for day in deliveries.Ship.astype(str).tolist()]
    
    mon = Series([d[-7:][:1] for d in del_days]).map({'1':'M','0':'_'})
    tue = Series([d[-6:][:1] for d in del_days]).map({'1':'T','0':'_'})
    wed = Series([d[-5:][:1] for d in del_days]).map({'1':'W','0':'_'})
    thu = Series([d[-4:][:1] for d in del_days]).map({'1':'R','0':'_'})
    fri = Series([d[-3:][:1] for d in del_days]).map({'1':'F','0':'_'})
    sat = Series([d[-2:][:1] for d in del_days]).map({'1':'S','0':'_'})
    sun = Series([d[-1:][:1] for d in del_days]).map({'1':'U','0':'_'})
    
    deliveries['DeliveryDays'] = del_days = list(itertools.chain.from_iterable([mon + tue + wed + thu + fri + sat + sun]))
    
    weekday = deliveries.Weekday = [d[:3] for d in deliveries.Weekday.astype(str).tolist()]
    _days = DataFrame(data={'Weekday':weekday, 'WeekPlanned':week_plan, 'WeekShipped':week_shipped, 'DelDays':del_days}) #'Monday':mon, 'Tuesday':tue, 'Wednesday':wed, 'Thursday':thu, 'Friday':fri, 'Saturday':sat, 'Sunday':sun,
    day_list = _days['WeekPlanned'].tolist()
    _days['WeekPlanned'] = [d if d in ['A','B'] else '' for d in day_list]
    
    _week_actual = _days.WeekShipped.tolist()
    _week_plan = _days['WeekPlanned'] = [ship_week if plan_week == '' else plan_week for ship_week, plan_week in zip(_week_actual,_days.WeekPlanned.tolist())]
    _days['OffWeek'] = _off_week = [p != a for p, a in zip(_week_plan, _week_actual)]
    
    off_mon = [str('M' not in d and w == 'Mon')[:1] for d, w in zip(del_days, weekday)]
    off_tue = [str('T' not in d and w == 'Tue')[:1] for d, w in zip(del_days, weekday)]
    off_wed = [str('W' not in d and w == 'Wed')[:1] for d, w in zip(del_days, weekday)]
    off_thu = [str('R' not in d and w == 'Thu')[:1] for d, w in zip(del_days, weekday)]
    off_fri = [str('F' not in d and w == 'Fri')[:1] for d, w in zip(del_days, weekday)]
    off_sat = [str('S' not in d and w == 'Sat')[:1] for d, w in zip(del_days, weekday)]
    off_sun = [str('U' not in d and w == 'Sun')[:1] for d, w in zip(del_days, weekday)]
    
    _off_days = DataFrame({'Mon':off_mon, 'Tue':off_tue, 'Wed':off_wed, 'Thu':off_thu, 
                           'Fri':off_fri, 'Sat':off_sat, 'Sun':off_sun, 'OffWeek':_off_week, 'Weekday':weekday})
    _off_days = _off_days[['Mon','Tue','Wed','Thu','Fri','Sat','Sun','Weekday','OffWeek']]                           
    _off_days['OffDayDelivery'] = ((_off_days['Mon'] == 'T') | (_off_days['Tue'] == 'T') |
                                   (_off_days['Wed'] == 'T') | (_off_days['Thu'] == 'T') |
                                   (_off_days['Fri'] == 'T') | (_off_days['Sat'] == 'T') |
                                   (_off_days['Sun'] == 'T') | (_off_days['OffWeek'] == True))
       
    print('Check here if you suspect a bug.')                    
    #check_later = _off_days[_off_days['OffDayDelivery'] == True]
    
    print('Mapping Call Codes.')
    deliveries = pd.concat([deliveries,_off_days[['OffWeek','OffDayDelivery']]], axis=1)
    deliveries.Call = deliveries.Call.map({1:'Customer Call', 2:'ROE/EDI', 3:'Salesperson Call', 4:'Telesales'})
    
    print('Putting Setup Date into proper date format.')
    setup_date = deliveries.CustomerSetup.astype(str).tolist()
    setup_month = Series([d.zfill(4)[:2] for d in setup_date])
    setup_year = Series(["20" + s[-2:] if int(s[-2:]) < 20 else "19" + s[-2:] for s in setup_date]) #this_century = [int(d[-2:]) < 20 for d in setup_date]
    
    deliveries['CustomerSetup'] = c_setup = [str(mon) + '-' + str(yr) for mon, yr in zip(setup_month, setup_year)]
    
    print('Defining new customers based on whether they were setup last month or not.')
    if dt.now().month == 1:
        last_month = '12'
    else:
        last_month = str(dt.now().month - 1).zfill(2)
    if dt.now().month == 1:
        this_year = str(dt.now().year - 1)
    else:
        this_year = str(dt.now().year)
    m_y_cutoff = last_month + '-' + this_year
        
    deliveries['NewCustomer'] = [1 if m_y_cutoff == setup else 0 for setup in c_setup]
    deliveries['OffDayDeliveries'] =  deliveries.OffDayDelivery.astype(int)
    
    print('Deriving number of weekly deliveries allotted to each customer.')
    _n_days = deliveries.Ship.astype(str).tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [sum_digits_in_string(n) for n in _n_days]
    _allot = deliveries['AllottedWeeklyDeliveryDays'].tolist()
    _week_ind = deliveries['ShipWeekPlan'].tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [a if w not in ['A','B'] else 0.5 for a, w in zip(_allot, _week_ind)]
    _n_days = deliveries.set_index('CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()
    
    print('\n')
    print('-'*100)    
    print('\n')    
    
    print('Aggregating by Day.')
    len_unique = lambda x: len(pd.unique(x))  # count distinct values (as in the variant below)
    agg_funcs_day = {'OffDayDeliveries' : {'Count':max},
                 'Date' : {'Count':len_unique},
                 'Cases' : {'Sum':sum, 'Avg':np.mean},
                 'Dollars' : {'Sum':sum, 'Avg':np.mean},
                 'NewCustomer': lambda x: min(x)}
    
    pass_through_cols = ['CustomerId','Customer','Week','Date']
    _agg_byday = DataFrame(deliveries.groupby(pass_through_cols).agg(agg_funcs_day)).reset_index(drop=False)
    _agg_byday = DataFrame(_agg_byday[['CustomerId','Customer','Week','Date','OffDayDeliveries','NewCustomer','Cases','Dollars']])
    _agg_byday.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byday.columns]
    _agg_byday.columns = ['CustomerId','Customer','Week','Date','Delivery','OffDayDelivery','NewCustomer','Cases|Sum','Cases|Avg','Dollars|Sum','Dollars|Avg']
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['CustomerId'].astype(int)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday['AllottedWeeklyDeliveryDays|Count'].map(_n_days)
    
    
    
    print('Aggregating by Week.')
    agg_funcs_week = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x)}
    
    _agg_byweek = DataFrame(_agg_byday.groupby(['CustomerId','Week']).agg(agg_funcs_week)).reset_index(drop=False)
    _agg_byweek.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byweek.columns]
    
    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries 
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery. Use a dictionary {(cust#, week) : n_deliveries_total}
    _c = _agg_byweek['CustomerId'].astype(str).tolist()
    _w = _agg_byweek['Week'].astype(str).tolist()
    _agg_byweek['_X'] = [c + ',' + w for c,w in zip(_c,_w)]
    by_week_map = _agg_byweek.set_index('_X')['Delivery|Count'].to_dict()
    
    cid = _agg_byday['CustomerId'].astype(str).tolist()
    wkk = _agg_byday['Week'].astype(str).tolist()
    _agg_byday['N_DeliveriesThisWeek'] = [c + ',' + w for c, w in zip(cid, wkk)]
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday['N_DeliveriesThisWeek'].map(Series(by_week_map))
    
    
    print('Using custom logic to define Additional Delivery Days.')
    addl_day_criteria_1 = ( _agg_byday.shift(1)['CustomerId'] == _agg_byday['CustomerId'] )
    addl_day_criteria_2 = ( _agg_byday.shift(1)['Week'] == _agg_byday['Week'] )
    addl_day_criteria_3 = ( _agg_byday['OffDayDelivery'] == 1 )
    addl_day_criteria_4 = ( _agg_byday['NewCustomer'] != 1 )
    addl_day_criteria_5 = ( _agg_byday['N_DeliveriesThisWeek'] > _agg_byday['AllottedWeeklyDeliveryDays|Count'] )
    
    _agg_byday['AdditionalDeliveryDays'] = Series(addl_day_criteria_1 & addl_day_criteria_2 & addl_day_criteria_3 & addl_day_criteria_4 & addl_day_criteria_5).astype(int)
    
    
    print('Aggregating by Customer.')    
    agg_funcs_cust = {'OffDayDelivery' : {'Count':sum},
                      'Delivery' : {'Count':sum},
                      'NewCustomer' : lambda x: min(x),
                      'AllottedWeeklyDeliveryDays|Count': lambda x: max(x),
                      'AdditionalDeliveryDays': lambda x: sum(x),
                      'Dollars|Sum':lambda x: int(sum(x)),
                      'Cases|Sum':lambda x: sum(x) }                                           
    
    _agg_bycust = DataFrame(_agg_byday.groupby(['CustomerId','Customer']).agg(agg_funcs_cust)).reset_index(drop=False)
    _agg_bycust.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_bycust.columns]
    _agg_bycust = _agg_bycust.reindex_axis(sorted(_agg_bycust.columns), axis=1)
    
    _agg_bycust.columns = ['AdditionalDeliveries','AllottedDeliveryDays','Cases',
                           'Customer','CustomerId','Deliveries','Dollars',
                           'NewCustomer','OffDayDeliveries']
    _agg_bycust = _agg_bycust[['CustomerId','Customer','NewCustomer','AllottedDeliveryDays','Deliveries',
                               'OffDayDeliveries','AdditionalDeliveries','Cases','Dollars']]
    
    
    print('Mapping useful Customer attributes.')
    attr = ['CustomerId','Warehouse','OnPremise','CustomerSetup','CustomerType','ShipWeekPlan','DeliveryDays']
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(drop=True)
    
    _agg_bycust = _agg_bycust.merge(customer_attributes, on='CustomerId', how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(by=['AdditionalDeliveries','OffDayDeliveries'], ascending=False).reset_index(drop=True)
    
    _agg_bycust['CasesPerDelivery'] = _agg_bycust['Cases'] / _agg_bycust['Deliveries']
    _agg_bycust['DollarsPerDelivery'] = round(_agg_bycust['Dollars'] / _agg_bycust['Deliveries'],2)
    
    _agg_bycust['OffDayDeliveries/Deliveries'] = round(_agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'],2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(_agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'],2)
    
    
    print('Mapping Tiers based on allotted delivery days.')
    tier_map = {0:'No Delivery Days Assigned',0.5:'Tier 4', 1:'Tier 3', 2:'Tier 2', 3:'Tier 1', 4:'Tier 1', 5:'Tier 1', 6:'Tier 1', 7:'Tier 1'}
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)
    
    addl_deliv = _agg_bycust['AdditionalDeliveries'].tolist()
    tier = _agg_bycust['Tier'].tolist()
    
    _agg_bycust['AdditionalDeliveries'] = [addl if t != 'No Delivery Days Assigned' else 0 for addl, t in zip(addl_deliv, tier)]
    
    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(np.nan, '')
    
    
    print('Creating Overall Summary.')
    agg_funcs_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           
    
    overall_summary = DataFrame(_agg_bycust.groupby(['Tier','Warehouse']).agg(agg_funcs_summary))
    overall_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in overall_summary.columns]
    overall_summary = overall_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    overall_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']
    
    print('Creating High-Level Summary.\n\n\n')
    agg_funcs_HL_summary = {'Deliveries':sum,
                         'OffDayDeliveries':sum,
                         'AdditionalDeliveries':sum,
                         'Dollars':{'Avg':np.mean},
                         'Cases':{'Avg':np.mean},
                         'CasesPerDelivery':{'Avg':np.mean},
                         'NewCustomer':sum,
                         'Customer':len,
                         'AllottedDeliveryDays':lambda x: round(np.mean(x),1)}                                           
    
    high_level_summary = DataFrame(_agg_bycust.groupby(['Tier']).agg(agg_funcs_HL_summary))
    high_level_summary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in high_level_summary.columns]
    high_level_summary = high_level_summary[['NewCustomer|sum','Customer|len','AllottedDeliveryDays|<lambda>',
                                       'Deliveries|sum','OffDayDeliveries|sum','AdditionalDeliveries|sum',
                                       'Cases|Avg','CasesPerDelivery|Avg','Dollars|Avg']]
    high_level_summary.columns = ['NewCustomers','Customers','AvgAllottedDeliveryDays','Deliveries','OffDayDeliveries','AdditionalDeliveries',
                                       'Cases|mean','CasesPerDelivery|mean','Dollars|mean']
                                       
    print('*'*100)
    print('Finished creating summaries at high level, overall, and aggregating by customer and by day.')
    print('*'*100)    

    return high_level_summary, overall_summary, _agg_bycust, _agg_byday, deliveries
Example #18
df.sort(columns=['one','two'], ascending=[0,1])

prices = [101.0,102.0,103.0]
tickers = ['GOOG','AAPL']
data = [v for v in itertools.product(tickers,prices)]
dates = pandas.date_range('2013-01-03',periods=3)
df = DataFrame(data, columns=['ticker','price'])
df['dates'] = dates.append(dates)
df
df.pivot(index='dates',columns='ticker',values='price')

original = DataFrame([[1,1],[2,2],[3.0,3]],index=['a','b','c'], columns=['one','two'])
original.reindex(index=['b','c','d'])
different = DataFrame([[1,1],[2,2],[3.0,3]],index=['c','d','e'], columns=['one','two'])
original.reindex_like(different)
original.reindex_axis(['two','one'], axis = 1)

left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
right = DataFrame([[1,2],[3,4],[7,8]],columns=['one','three'])
left.merge(right,on='one') # Same as how='inner'
left.merge(right,on='one', how='left')
left.merge(right,on='one', how='right')
left.merge(right,on='one', how='outer')

left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
left
right = DataFrame([[nan,12],[13,nan],[nan,8]],columns=['one','two'],index=[1,2,3])
right
left.update(right) # Updates values in left
left
Example #19
data = [v for v in itertools.product(tickers, prices)]
dates = pandas.date_range('2013-01-03', periods=3)
df = DataFrame(data, columns=['ticker', 'price'])
df['dates'] = dates.append(dates)
df
df.pivot(index='dates', columns='ticker', values='price')

original = DataFrame([[1, 1], [2, 2], [3.0, 3]],
                     index=['a', 'b', 'c'],
                     columns=['one', 'two'])
original.reindex(index=['b', 'c', 'd'])
different = DataFrame([[1, 1], [2, 2], [3.0, 3]],
                      index=['c', 'd', 'e'],
                      columns=['one', 'two'])
original.reindex_like(different)
original.reindex_axis(['two', 'one'], axis=1)

left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=['one', 'three'])
left.merge(right, on='one')  # Same as how='inner'
left.merge(right, on='one', how='left')
left.merge(right, on='one', how='right')
left.merge(right, on='one', how='outer')

left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=['one', 'two'])
left
right = DataFrame([[nan, 12], [13, nan], [nan, 8]],
                  columns=['one', 'two'],
                  index=[1, 2, 3])
right
left.update(right)  # Updates values in left
Example #20
def clean_pw_offday(pw_offday, weeklookup):
    '''
    Clean pw_offday query without filtering out non-off-days
    invoice-level => day level => customer level
    '''
    print('*' * 100)
    print('Cleaning pw_offday query and creating summaries.')
    print('*' * 100)
    deliveries = pw_offday

    print('\n\n\nDeclaring functions for later use.')

    def as400_date(dat):
        '''Accepts date as formatted in AS400'''
        dat = str(dat)
        dat = dat[-6:]
        dat = dt.date(dt.strptime(dat, '%y%m%d'))
        return dat

    def sum_digits_in_string(digit):
        return sum(int(x) for x in digit if x.isdigit())

    print('Mapping Columns.')
    deliveries.columns = [
        'Date', 'Division', 'Invoice', 'CustomerId', 'Call', 'Priority',
        'Warehouse', 'Cases', 'Dollars', 'Ship', 'Salesperson', 'ShipWeekPlan',
        'Merchandising', 'OnPremise', 'CustomerSetup', 'CustomerType',
        'Customer'
    ]

    print('Mapping Customer types.')
    typ_map = {'A':'Bar/Tavern','C':'Country Club','E':'Transportation/Airline','G':'Gambling',\
                'J':'Hotel/Motel','L':'Restaurant','M':'Military','N':'Fine Dining','O':'Internal',\
                'P':'Country/Western','S':'Package Store','T':'Supermarket/Grocery','V':'Drug Store',\
                'Y':'Convenience Store','Z':'Catering','3':'Night Club','5':'Adult Entertainment','6':'Sports Bar',\
                'I':'Church','F':'Membership Club','B':'Mass Merchandiser','H':'Fraternal Organization',\
                '7':'Sports Venue'}
    deliveries.CustomerType = deliveries.CustomerType.astype(str).map(typ_map)

    print('Mapping Warehouse names.')
    whs_map = {
        1: 'Kansas City',
        2: 'Saint Louis',
        3: 'Columbia',
        4: 'Cape Girardeau',
        5: 'Springfield'
    }
    deliveries.Warehouse = deliveries.Warehouse.map(whs_map)

    print('Processing dates.')
    deliveries.Date = [
        as400_date(d) for d in deliveries.Date.astype(str).tolist()
    ]
    weeklookup['Date'] = [
        dt.date(dt.strptime(w_Dat, '%m/%d/%Y'))
        for w_Dat in weeklookup['Date'].astype(str).tolist()
    ]

    print('Merging on dates with week lookup.')
    deliveries = deliveries.merge(weeklookup, on='Date')

    dat = Series(deliveries.Date.tolist())
    deliveries['Weekday'] = Series([dt.strftime(d, '%A') for d in dat])

    week_plan = deliveries.ShipWeekPlan.tolist()
    week_shipped = deliveries.ShipWeek.tolist()

    print('Using custom logic to derive which days were off-day deliveries.')
    deliveries.Ship = del_days = [
        str('%07d' % int(str(day).zfill(0)))
        for day in deliveries.Ship.astype(str).tolist()
    ]

    mon = Series([d[-7:][:1] for d in del_days]).map({'1': 'M', '0': '_'})
    tue = Series([d[-6:][:1] for d in del_days]).map({'1': 'T', '0': '_'})
    wed = Series([d[-5:][:1] for d in del_days]).map({'1': 'W', '0': '_'})
    thu = Series([d[-4:][:1] for d in del_days]).map({'1': 'R', '0': '_'})
    fri = Series([d[-3:][:1] for d in del_days]).map({'1': 'F', '0': '_'})
    sat = Series([d[-2:][:1] for d in del_days]).map({'1': 'S', '0': '_'})
    sun = Series([d[-1:][:1] for d in del_days]).map({'1': 'U', '0': '_'})

    deliveries['DeliveryDays'] = del_days = list(
        itertools.chain.from_iterable(
            [mon + tue + wed + thu + fri + sat + sun]))

    weekday = deliveries.Weekday = [
        d[:3] for d in deliveries.Weekday.astype(str).tolist()
    ]
    _days = DataFrame(
        data={
            'Weekday': weekday,
            'WeekPlanned': week_plan,
            'WeekShipped': week_shipped,
            'DelDays': del_days
        }
    )  #'Monday':mon, 'Tuesday':tue, 'Wednesday':wed, 'Thursday':thu, 'Friday':fri, 'Saturday':sat, 'Sunday':sun,
    day_list = _days['WeekPlanned'].tolist()
    _days['WeekPlanned'] = [d if d in ['A', 'B'] else '' for d in day_list]

    _week_actual = _days.WeekShipped.tolist()
    _week_plan = _days['WeekPlanned'] = [
        ship_week if plan_week == '' else plan_week for ship_week, plan_week in
        zip(_week_actual, _days.WeekPlanned.tolist())
    ]
    _days['OffWeek'] = _off_week = [
        p != a for p, a in zip(_week_plan, _week_actual)
    ]

    off_mon = [
        str('M' not in d and w == 'Mon')[:1]
        for d, w in zip(del_days, weekday)
    ]
    off_tue = [
        str('T' not in d and w == 'Tue')[:1]
        for d, w in zip(del_days, weekday)
    ]
    off_wed = [
        str('W' not in d and w == 'Wed')[:1]
        for d, w in zip(del_days, weekday)
    ]
    off_thu = [
        str('R' not in d and w == 'Thu')[:1]
        for d, w in zip(del_days, weekday)
    ]
    off_fri = [
        str('F' not in d and w == 'Fri')[:1]
        for d, w in zip(del_days, weekday)
    ]
    off_sat = [
        str('S' not in d and w == 'Sat')[:1]
        for d, w in zip(del_days, weekday)
    ]
    off_sun = [
        str('U' not in d and w == 'Sun')[:1]
        for d, w in zip(del_days, weekday)
    ]

    _off_days = DataFrame({
        'Mon': off_mon,
        'Tue': off_tue,
        'Wed': off_wed,
        'Thu': off_thu,
        'Fri': off_fri,
        'Sat': off_sat,
        'Sun': off_sun,
        'OffWeek': _off_week,
        'Weekday': weekday
    })
    _off_days = _off_days[[
        'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Weekday', 'OffWeek'
    ]]
    _off_days['OffDayDelivery'] = (_off_days['Mon'] == 'T') | (
        _off_days['Tue']
        == 'T') | (_off_days['Wed'] == 'T') | (_off_days['Thu'] == 'T') | (
            _off_days['Fri'] == 'T') | (_off_days['Sat'] == 'T') | (
                _off_days['Sun'] == 'T') | (_off_days['OffWeek'] == True)

    print('Check here if you suspect a bug.')
    #check_later = _off_days[_off_days['OffDayDelivery'] == True]

    print('Mapping Call Codes.')
    deliveries = pd.concat(
        [deliveries, _off_days[['OffWeek', 'OffDayDelivery']]], axis=1)
    deliveries.Call = deliveries.Call.map({
        1: 'Customer Call',
        2: 'ROE/EDI',
        3: 'Salesperson Call',
        4: 'Telesales'
    })

    print('Putting Setup Date into proper date format.')
    setup_date = deliveries.CustomerSetup.astype(str).tolist()
    setup_month = Series([d.zfill(4)[:2] for d in setup_date])
    setup_year = Series([
        "20" + s[-2:] if int(s[-2:]) < 20 else "19" + s[-2:]
        for s in setup_date
    ])  #this_century = [int(d[-2:]) < 20 for d in setup_date]

    deliveries['CustomerSetup'] = c_setup = [
        str(mon) + '-' + str(yr) for mon, yr in zip(setup_month, setup_year)
    ]

    print(
        'Defining new customers based on whether they were setup last month or not.'
    )
    if dt.now().month == 1:
        last_month = '12'
    else:
        last_month = str(dt.now().month - 1).zfill(2)
    if dt.now().month == 1:
        this_year = str(dt.now().year - 1)
    else:
        this_year = str(dt.now().year)
    m_y_cutoff = last_month + '-' + this_year

    deliveries['NewCustomer'] = [
        1 if m_y_cutoff == setup else 0 for setup in c_setup
    ]
    deliveries['OffDayDeliveries'] = deliveries.OffDayDelivery.astype(int)

    print('Deriving number of weekly deliveries allotted to each customer.')
    _n_days = deliveries.Ship.astype(str).tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [
        sum_digits_in_string(n) for n in _n_days
    ]
    _allot = deliveries['AllottedWeeklyDeliveryDays'].tolist()
    _week_ind = deliveries['ShipWeekPlan'].tolist()
    deliveries['AllottedWeeklyDeliveryDays'] = [
        a if w not in ['A', 'B'] else 0.5 for a, w in zip(_allot, _week_ind)
    ]
    _n_days = deliveries.set_index(
        'CustomerId')['AllottedWeeklyDeliveryDays'].to_dict()

    print('\n')
    print('-' * 100)
    print('\n')

    print('Aggregating by Day.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_day = {
        'OffDayDeliveries': {
            'Count': max
        },
        'Date': {
            'Count': len_unique
        },
        'Cases': {
            'Sum': sum,
            'Avg': np.mean
        },
        'Dollars': {
            'Sum': sum,
            'Avg': np.mean
        },
        'NewCustomer': lambda x: min(x)
    }

    pass_through_cols = ['CustomerId', 'Customer', 'Week', 'Date']
    _agg_byday = DataFrame(
        deliveries.groupby(pass_through_cols).agg(agg_funcs_day)).reset_index(
            drop=False)
    _agg_byday = DataFrame(_agg_byday[[
        'CustomerId', 'Customer', 'Week', 'Date', 'OffDayDeliveries',
        'NewCustomer', 'Cases', 'Dollars'
    ]])
    _agg_byday.columns = [
        '%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byday.columns
    ]
    _agg_byday.columns = [
        'CustomerId', 'Customer', 'Week', 'Date', 'Delivery', 'OffDayDelivery',
        'NewCustomer', 'Cases|Sum', 'Cases|Avg', 'Dollars|Sum', 'Dollars|Avg'
    ]
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday[
        'CustomerId'].astype(int)
    _agg_byday['AllottedWeeklyDeliveryDays|Count'] = _agg_byday[
        'AllottedWeeklyDeliveryDays|Count'].map(_n_days)

    print('Aggregating by Week.')
    agg_funcs_week = {
        'OffDayDelivery': {
            'Count': sum
        },
        'Delivery': {
            'Count': sum
        },
        'NewCustomer': lambda x: min(x)
    }

    _agg_byweek = DataFrame(
        _agg_byday.groupby(['CustomerId', 'Week'
                            ]).agg(agg_funcs_week)).reset_index(drop=False)
    _agg_byweek.columns = [
        '%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_byweek.columns
    ]

    print('Mapping number of deliveries to Customers.')
    # Map number of total deliveries each week by customer
    # to determine whether a customer with TWR deliveries
    # got TWF deliveries -- which is an off-day delivery
    # but not an additional delivery. Use a dictionary {(cust#, week) : n_deliveries_total}
    _c = _agg_byweek['CustomerId'].astype(str).tolist()
    _w = _agg_byweek['Week'].astype(str).tolist()
    _agg_byweek['_X'] = [c + ',' + w for c, w in zip(_c, _w)]
    by_week_map = _agg_byweek.set_index('_X')['Delivery|Count'].to_dict()

    cid = _agg_byday['CustomerId'].astype(str).tolist()
    wkk = _agg_byday['Week'].astype(str).tolist()
    _agg_byday['N_DeliveriesThisWeek'] = [
        c + ',' + w for c, w in zip(cid, wkk)
    ]
    _agg_byday['N_DeliveriesThisWeek'] = _agg_byday[
        'N_DeliveriesThisWeek'].map(Series(by_week_map))

    print('Using custom logic to define Additional Delivery Days.')
    addl_day_criteria_1 = (
        _agg_byday.shift(1)['CustomerId'] == _agg_byday['CustomerId'])
    addl_day_criteria_2 = (_agg_byday.shift(1)['Week'] == _agg_byday['Week'])
    addl_day_criteria_3 = (_agg_byday['OffDayDelivery'] == 1)
    addl_day_criteria_4 = (_agg_byday['NewCustomer'] != 1)
    addl_day_criteria_5 = (_agg_byday['N_DeliveriesThisWeek'] >
                           _agg_byday['AllottedWeeklyDeliveryDays|Count'])

    _agg_byday['AdditionalDeliveryDays'] = Series(
        addl_day_criteria_1 & addl_day_criteria_2 & addl_day_criteria_3
        & addl_day_criteria_4 & addl_day_criteria_5).astype(int)

    print('Aggregating by Customer.')
    agg_funcs_cust = {
        'OffDayDelivery': {
            'Count': sum
        },
        'Delivery': {
            'Count': sum
        },
        'NewCustomer': lambda x: min(x),
        'AllottedWeeklyDeliveryDays|Count': lambda x: max(x),
        'AdditionalDeliveryDays': lambda x: sum(x),
        'Dollars|Sum': lambda x: int(sum(x)),
        'Cases|Sum': lambda x: sum(x)
    }

    _agg_bycust = DataFrame(
        _agg_byday.groupby(['CustomerId', 'Customer'
                            ]).agg(agg_funcs_cust)).reset_index(drop=False)
    _agg_bycust.columns = [
        '%s%s' % (a, '|%s' % b if b else '') for a, b in _agg_bycust.columns
    ]
    _agg_bycust = _agg_bycust.reindex_axis(sorted(_agg_bycust.columns), axis=1)

    _agg_bycust.columns = [
        'AdditionalDeliveries', 'AllottedDeliveryDays', 'Cases', 'Customer',
        'CustomerId', 'Deliveries', 'Dollars', 'NewCustomer',
        'OffDayDeliveries'
    ]
    _agg_bycust = _agg_bycust[[
        'CustomerId', 'Customer', 'NewCustomer', 'AllottedDeliveryDays',
        'Deliveries', 'OffDayDeliveries', 'AdditionalDeliveries', 'Cases',
        'Dollars'
    ]]

    print('Mapping useful Customer attributes.')
    attr = [
        'CustomerId', 'Warehouse', 'OnPremise', 'CustomerSetup',
        'CustomerType', 'ShipWeekPlan', 'DeliveryDays'
    ]
    customer_attributes = deliveries[attr].drop_duplicates().reset_index(
        drop=True)

    _agg_bycust = _agg_bycust.merge(customer_attributes,
                                    on='CustomerId',
                                    how='inner').drop_duplicates()
    _agg_bycust = _agg_bycust.sort_values(
        by=['AdditionalDeliveries',
            'OffDayDeliveries'], ascending=False).reset_index(drop=True)

    _agg_bycust[
        'CasesPerDelivery'] = _agg_bycust['Cases'] / _agg_bycust['Deliveries']
    _agg_bycust['DollarsPerDelivery'] = round(
        _agg_bycust['Dollars'] / _agg_bycust['Deliveries'], 2)

    _agg_bycust['OffDayDeliveries/Deliveries'] = round(
        _agg_bycust['OffDayDeliveries'] / _agg_bycust['Deliveries'], 2)
    _agg_bycust['AdditionalDeliveries/Deliveries'] = round(
        _agg_bycust['AdditionalDeliveries'] / _agg_bycust['Deliveries'], 2)

    print('Mapping Tiers based on allotted delivery days.')
    tier_map = {
        0: 'No Delivery Days Assigned',
        0.5: 'Tier 4',
        1: 'Tier 3',
        2: 'Tier 2',
        3: 'Tier 1',
        4: 'Tier 1',
        5: 'Tier 1',
        6: 'Tier 1',
        7: 'Tier 1'
    }
    _agg_bycust['Tier'] = _agg_bycust['AllottedDeliveryDays'].map(tier_map)

    addl_deliv = _agg_bycust['AdditionalDeliveries'].tolist()
    tier = _agg_bycust['Tier'].tolist()

    _agg_bycust['AdditionalDeliveries'] = [
        addl if t != 'No Delivery Days Assigned' else 0
        for addl, t in zip(addl_deliv, tier)
    ]

    _agg_bycust['ShipWeekPlan'] = _agg_bycust['ShipWeekPlan'].replace(
        np.nan, '')

    print('Creating Overall Summary.')
    agg_funcs_summary = {
        'Deliveries': sum,
        'OffDayDeliveries': sum,
        'AdditionalDeliveries': sum,
        'Dollars': {
            'Avg': np.mean
        },
        'Cases': {
            'Avg': np.mean
        },
        'CasesPerDelivery': {
            'Avg': np.mean
        },
        'NewCustomer': sum,
        'Customer': len,
        'AllottedDeliveryDays': lambda x: round(np.mean(x), 1)
    }

    overall_summary = DataFrame(
        _agg_bycust.groupby(['Tier', 'Warehouse']).agg(agg_funcs_summary))
    overall_summary.columns = [
        '%s%s' % (a, '|%s' % b if b else '')
        for a, b in overall_summary.columns
    ]
    overall_summary = overall_summary[[
        'NewCustomer|sum', 'Customer|len', 'AllottedDeliveryDays|<lambda>',
        'Deliveries|sum', 'OffDayDeliveries|sum', 'AdditionalDeliveries|sum',
        'Cases|Avg', 'CasesPerDelivery|Avg', 'Dollars|Avg'
    ]]
    overall_summary.columns = [
        'NewCustomers', 'Customers', 'AvgAllottedDeliveryDays', 'Deliveries',
        'OffDayDeliveries', 'AdditionalDeliveries', 'Cases|mean',
        'CasesPerDelivery|mean', 'Dollars|mean'
    ]

    print('Creating High-Level Summary.\n\n\n')
    agg_funcs_HL_summary = {
        'Deliveries': sum,
        'OffDayDeliveries': sum,
        'AdditionalDeliveries': sum,
        'Dollars': {
            'Avg': np.mean
        },
        'Cases': {
            'Avg': np.mean
        },
        'CasesPerDelivery': {
            'Avg': np.mean
        },
        'NewCustomer': sum,
        'Customer': len,
        'AllottedDeliveryDays': lambda x: round(np.mean(x), 1)
    }

    high_level_summary = DataFrame(
        _agg_bycust.groupby(['Tier']).agg(agg_funcs_HL_summary))
    high_level_summary.columns = [
        '%s%s' % (a, '|%s' % b if b else '')
        for a, b in high_level_summary.columns
    ]
    high_level_summary = high_level_summary[[
        'NewCustomer|sum', 'Customer|len', 'AllottedDeliveryDays|<lambda>',
        'Deliveries|sum', 'OffDayDeliveries|sum', 'AdditionalDeliveries|sum',
        'Cases|Avg', 'CasesPerDelivery|Avg', 'Dollars|Avg'
    ]]
    high_level_summary.columns = [
        'NewCustomers', 'Customers', 'AvgAllottedDeliveryDays', 'Deliveries',
        'OffDayDeliveries', 'AdditionalDeliveries', 'Cases|mean',
        'CasesPerDelivery|mean', 'Dollars|mean'
    ]

    print('*' * 100)
    print(
        'Finished creating summaries at high level, overall, and aggregating by customer and by day.'
    )
    print('*' * 100)

    return high_level_summary, overall_summary, _agg_bycust, _agg_byday
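A minimal sketch of consuming the four frames returned above, writing them to a single workbook; the file name and sheet names are assumptions, not part of the original script:

import pandas as pd

with pd.ExcelWriter('delivery_summaries.xlsx') as writer:  # hypothetical path
    high_level_summary.to_excel(writer, sheet_name='HighLevel')
    overall_summary.to_excel(writer, sheet_name='Overall')
    _agg_bycust.to_excel(writer, sheet_name='ByCustomer', index=False)
    _agg_byday.to_excel(writer, sheet_name='ByDay', index=False)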
df.sort_values(by=["one", "two"], ascending=[False, True])  # DataFrame.sort was removed; sort_values is its replacement
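For reference, a self-contained sketch of the same call (the frame here is invented for illustration):

import pandas as pd

df = pd.DataFrame({"one": [2, 1, 2], "two": [3, 1, 2]})
df.sort_values(by=["one", "two"], ascending=[False, True])  # descending on "one", ascending on "two"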

prices = [101.0, 102.0, 103.0]
tickers = ["GOOG", "AAPL"]
data = [v for v in itertools.product(tickers, prices)]
dates = pandas.date_range("2013-01-03", periods=3)
df = DataFrame(data, columns=["ticker", "price"])
df["dates"] = dates.append(dates)
df
df.pivot(index="dates", columns="ticker", values="price")
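pivot raises a ValueError when an (index, columns) pair occurs more than once; pivot_table tolerates duplicates by aggregating them, e.g.:

df.pivot_table(index="dates", columns="ticker", values="price", aggfunc="mean")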

original = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=["a", "b", "c"], columns=["one", "two"])
original.reindex(index=["b", "c", "d"])
different = DataFrame([[1, 1], [2, 2], [3.0, 3]], index=["c", "d", "e"], columns=["one", "two"])
original.reindex_like(different)
original.reindex(columns=["two", "one"])  # reindex_axis is deprecated; use reindex(columns=...)
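reindex fills missing labels with NaN by default; a fill_value can be supplied instead:

original.reindex(index=["b", "c", "d"], fill_value=0)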

left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
right = DataFrame([[1, 2], [3, 4], [7, 8]], columns=["one", "three"])
left.merge(right, on="one")  # Same as how='inner'
left.merge(right, on="one", how="left")
left.merge(right, on="one", how="right")
left.merge(right, on="one", how="outer")
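Passing indicator=True adds a _merge column recording which side each row came from, which is useful for auditing outer joins:

left.merge(right, on="one", how="outer", indicator=True)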

left = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["one", "two"])
left
right = DataFrame([[np.nan, 12], [13, np.nan], [np.nan, 8]], columns=["one", "two"], index=[1, 2, 3])
right
left.update(right)  # Overwrites left in place with non-NaN values from right
left
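update modifies left in place and ignores NaN in right; combine_first is the non-destructive relative, keeping left's values and filling its gaps from right:

left.combine_first(right)  # returns a new frame; left's values win where present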
def aggregate_unsaleables_by_product(pwunsale_tidy, pwrct1_tidy, pw_ytdprod,
                                     pw_ytdsupp):
    '''
    Aggregates unsaleable returns & dumps by product.

    Takes tidy data as input (produced by the previous function).
    '''
    pwunsale = pwunsale_tidy
    pwrct1 = pwrct1_tidy

    print('Expect to see the following. \n\n\n')
    tot_unsaleable = np.sum(pwrct1_tidy['ExtCost'])
    returned = np.sum(pwunsale_tidy['ExtCost'])

    print('Total unsaleables expected:  $%.2f' % tot_unsaleable)
    print('Total returns expected:  $%.2f' % returned)

    print('\n\n\nAggregating RCT1 data by Product.')
    agg_funcs_product_rct = {
        'CasesUnsaleable': {
            'avg': np.mean,
            'sum': np.sum
        },
        'ExtCost': {
            'avg': np.mean,
            'sum': np.sum
        }
    }

    grp_cols = ['SupplierId', 'Supplier', 'ProductId', 'Product']
    _agg_byproduct_rct = DataFrame(
        pwrct1.groupby(grp_cols).agg(agg_funcs_product_rct).reset_index(
            drop=False))
    _agg_byproduct_rct.columns = [
        '%s%s' % (a, '|%s' % b if b else '')
        for a, b in _agg_byproduct_rct.columns
    ]
    # reindex_axis was deprecated in pandas 0.21 and later removed;
    # reindex(columns=...) is the equivalent. Columns are sorted
    # alphabetically so the positional rename below lines up.
    _agg_byproduct_rct = _agg_byproduct_rct.reindex(
        columns=sorted(_agg_byproduct_rct.columns))
    _agg_byproduct_rct.columns = [
        'CasesUnsaleable|avg', 'CasesUnsaleable|sum', 'DollarsUnsaleable|avg',
        'DollarsUnsaleable|sum', 'Product', 'ProductId', 'Supplier',
        'SupplierId'
    ]

    print('\nUpdated unsaleables: $%.2f \n' %
          np.sum(_agg_byproduct_rct['DollarsUnsaleable|sum']))

    print('Aggregating MTC data by Product.')
    agg_funcs_product_mtc = {
        'CasesReturned': {
            'avg': np.mean,
            'sum': np.sum
        },
        'ExtCost': {
            'avg': np.mean,
            'sum': np.sum
        }
    }

    _agg_byproduct_mtc = DataFrame(
        pwunsale.groupby(grp_cols).agg(agg_funcs_product_mtc).reset_index(
            drop=False))
    _agg_byproduct_mtc.columns = [
        '%s%s' % (a, '|%s' % b if b else '')
        for a, b in _agg_byproduct_mtc.columns
    ]
    _agg_byproduct_mtc = _agg_byproduct_mtc.reindex(
        columns=sorted(_agg_byproduct_mtc.columns))  # see reindex_axis note above
    _agg_byproduct_mtc.columns = [
        'CasesReturned|avg', 'CasesReturned|sum', 'DollarsReturned|avg',
        'DollarsReturned|sum', 'Product', 'ProductId', 'Supplier', 'SupplierId'
    ]

    print('\nUpdated returns: $%.2f \n' %
          np.sum(_agg_byproduct_mtc['DollarsReturned|sum']))

    print('Combining RCT and MTC data.')
    _agg_byproduct_combined = _agg_byproduct_rct.merge(
        _agg_byproduct_mtc.drop(labels=['Supplier', 'Product'], axis=1),
        on=['SupplierId', 'ProductId'],
        how='outer')
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; plain int is
    # equivalent here.
    _agg_byproduct_combined[['ProductId', 'SupplierId']] = (
        _agg_byproduct_combined[['ProductId', 'SupplierId']].astype(int))

    print('Merging in Directors on the SupplierId field.')
    # 'directors' is not a parameter; it is assumed to be a module-level
    # DataFrame built earlier in the script.
    _agg_byproduct_combined = _agg_byproduct_combined.merge(
        directors[['SupplierId', 'Director']], on='SupplierId', how='left')

    print('\nUpdated Unsaleables: $%.2f' %
          np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' %
          np.sum(_agg_byproduct_combined['DollarsReturned|sum']))

    print('Reordering columns.')
    reorder_cols = [
        'Director', 'SupplierId', 'Supplier', 'ProductId', 'Product',
        'DollarsUnsaleable|sum', 'CasesUnsaleable|sum',
        'DollarsUnsaleable|avg', 'CasesUnsaleable|avg', 'DollarsReturned|sum',
        'CasesReturned|sum', 'DollarsReturned|avg', 'CasesReturned|avg'
    ]
    _agg_byproduct_combined = _agg_byproduct_combined[reorder_cols]

    print('Mapping in attribute columns.')
    _attrs = ['ProductId', 'Size', 'Class', 'QPC']
    _attributes = pwrct1[_attrs].drop_duplicates(subset='ProductId')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(_attributes,
                                                            on='ProductId',
                                                            how='left')

    print('Mapping in YTD sales by Product.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(pw_ytdprod,
                                                            on='ProductId',
                                                            how='left')

    print('Mapping in YTD sales by Supplier.')
    _agg_byproduct_combined = _agg_byproduct_combined.merge(pw_ytdsupp,
                                                            on='SupplierId',
                                                            how='left')

    print('Deriving percentage of sales by Product.')
    _agg_byproduct_combined['PercentSales|byproduct'] = np.divide(
        _agg_byproduct_combined['DollarsUnsaleable|sum'],
        _agg_byproduct_combined['DollarSales|byproduct'])

    print('Deriving percentage of sales by Supplier.')
    _agg_byproduct_combined['PercentSales|bysupplier'] = np.divide(
        _agg_byproduct_combined['DollarsUnsaleable|sum'],
        _agg_byproduct_combined['DollarSales|bysupplier'])
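    # Note: where DollarSales is zero, the division yields inf (or NaN for
    # 0/0); the fillna(0) below replaces NaN but leaves inf in place.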

    print('\nUpdated Unsaleables: $%.2f' %
          np.sum(_agg_byproduct_combined['DollarsUnsaleable|sum']))
    print('Updated Returns: $%.2f \n' %
          np.sum(_agg_byproduct_combined['DollarsReturned|sum']))

    print('Checking for and dropping Duplicates.')
    _agg_byproduct_combined.drop_duplicates(inplace=True)

    print('Replacing NaN values with zeros for readability.')
    _agg_byproduct_combined.fillna(0, inplace=True)

    print('Sorting in descending order on total unsaleables.\n\n\n')
    _agg_byproduct_combined.sort_values('DollarsUnsaleable|sum',
                                        ascending=False,
                                        inplace=True)

    print('Resetting index.')
    _agg_byproduct_combined.reset_index(inplace=True, drop=True)

    print('Compare values below to originals. \n\n\n')
    new_tot_unsaleable = np.sum(
        _agg_byproduct_combined['DollarsUnsaleable|sum'])
    new_returned = np.sum(_agg_byproduct_combined['DollarsReturned|sum'])

    print(
        'Original Unsaleables:  $%.2f \nPost-Processing Unsaleables:  $%.2f \n'
        % (tot_unsaleable, new_tot_unsaleable))
    print('Original Returns:  $%.2f \nPost-Processing Returns:  $%.2f \n\n\n' %
          (returned, new_returned))

    print('*' * 100)
    print(
        'If the numbers above do not match then there is a bug in the program.'
    )
    print('*' * 100)

    return _agg_byproduct_combined
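A minimal usage sketch, assuming the tidy inputs and the module-level directors frame were built by the earlier steps; the output path is an assumption:

unsaleables = aggregate_unsaleables_by_product(
    pwunsale_tidy, pwrct1_tidy, pw_ytdprod, pw_ytdsupp)
unsaleables.to_csv('unsaleables_by_product.csv', index=False)  # hypothetical output file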