Beispiel #1
0
    skel['ldate'] = df.ldate
# Optional attribute columns: each one is copied from df into skel only
# when the matching switch in the config module (cf) is on.
if cf.add_lname_col:
    skel['lname'] = df['lname']
if cf.add_line_col:
    skel['line'] = df['line']
if cf.add_twa_col:
    skel['twa'] = df['twa']

# the fur column is carried along unless cf.actives_only is set
if not cf.actives_only:
    skel['fur'] = df['fur']

# SCALE*

if cf.compute_pay_measures:

    # starting longevity (years) goes onto df first, then is assigned to
    # skel as a temporary working column
    df['s_lyears'] = f.longevity_at_startdate(list(df['ldate']))
    skel['s_lyears'] = df['s_lyears']

    # one month expressed as a fraction of a year
    month_inc = 1 / 12

    # scale is the payrate longevity level for each employee-month:
    #   starting longevity years + (month number * monthly increment),
    #   truncated toward zero by the integer cast,
    #   then limited to the range 1..cf.top_of_scale
    skel['scale'] = np.clip(
        (skel['mnum'] * month_inc + skel['s_lyears']).astype(int),
        1,
        cf.top_of_scale)

    # the working column is not part of the finished skeleton
    del skel['s_lyears']
def main():
    '''Build the long-form "skeleton" dataframe from the pickled master list.

    Reads 'dill/master.pkl', 'dill/dict_settings.pkl' and
    'dill/last_month.pkl'.  Employees who pass the retirement-date and
    line/fur filters are expanded to one row per employee per month, and
    the following columns are attached: mnum, idx, empkey, mth_pcnt, the
    columns of the month-form dates frame, fur, the optional attribute
    columns switched on in the settings dictionary, ret_mark, scale and
    s_lmonths (only when 'compute_pay_measures' is set) and age.

    The result is written to 'dill/skeleton.pkl' when the 'save_to_pickle'
    setting is truthy.  Nothing is returned.

    If the master list cannot be read (OSError), a message is printed and
    the routine stops through sys.exit().
    '''
    folder, ext = 'dill/', '.pkl'
    master_path = folder + 'master' + ext

    # master.pkl is the prepared, order-independent, concatenated list data
    try:
        roster = pd.read_pickle(master_path)
    except OSError:
        import sys
        print('\nMaster list not found.  Run build_program_files script?\n\n' +
              'Skeleton build failed.\n\n' +
              '  >>> exiting routine.\n')
        sys.exit()

    skel_path = folder + 'skeleton' + ext

    settings = pd.read_pickle('dill/dict_settings.pkl')
    start_date = settings['starting_date']

    # keep only employees whose retdate is no earlier than one month
    # before the starting date
    earliest_retdate = start_date - pd.DateOffset(months=1)
    roster = roster[roster.retdate >= earliest_retdate]

    # furloughed employees (fur == 1) are kept along with line == 1
    df = roster[(roster.line == 1) | (roster.fur == 1)].copy()

    # the full master list is not needed past this point
    del roster

    # MNUM*
    # career months per employee (short_form); this feeds the mnum, idx
    # and mth_pcnt work below
    career_mths = f.career_months(df, start_date)

    # count of non-retired employees in each month (month_form)
    emp_per_month = f.count_per_month(career_mths)

    # month numbers, each repeated once per employee remaining in that
    # month (long_form)
    month_nums = f.gen_month_skeleton(emp_per_month)

    # MNUM
    # the month number column is the foundation of the long-form frame
    skel = pd.DataFrame(month_nums.astype(int), columns=['mnum'])

    # IDX*, EMPKEY
    # employee index and employee key for every employee-month row,
    # used for merging and data alignment further down
    long_index, long_emp = f.gen_skel_emp_idx(emp_per_month,
                                              career_mths,
                                              df.empkey.values)
    skel['idx'] = long_index.astype(int)
    skel['empkey'] = long_emp.astype(int)

    # birth dates (short_form), captured before df is re-indexed;
    # used for the starting age calculation
    dobs = list(df['dob'])

    # last-month pay percentage is looked up by retirement date:
    # index df by retdate so the assignment aligns on it, then return to
    # an empkey index (the empkey column is kept as well)
    df_last = pd.read_pickle('dill/last_month.pkl')
    df = df.set_index('retdate')
    df['lmonth_pcnt'] = df_last.last_pay
    df = df.reset_index().set_index('empkey',
                                    verify_integrity=False,
                                    drop=False)

    # one row per employee, keyed by (positional idx, final month number)
    last_month = pd.DataFrame({'mth_pcnt': df.lmonth_pcnt.values,
                               'final_month': career_mths})
    last_month['idx'] = last_month.index
    last_month = last_month.set_index(['idx', 'final_month'])

    skel = skel.merge(last_month,
                      how='outer',
                      left_on=['idx', 'mnum'],
                      right_index=True)

    # MTH_PCNT
    # rows that found no match in last_month get a value of 1
    skel['mth_pcnt'] = skel['mth_pcnt'].fillna(1)

    # DATE, YEAR, PAY RAISE*
    # month_form frame of end-of-month dates, one per model month
    month_ends = pd.date_range(start_date,
                               periods=len(emp_per_month),
                               freq='M')
    df_dates = pd.DataFrame(month_ends, columns=['date'])

    # per the original notes, this adds the year and a pay raise factor
    # (1.0 or a calculated raise after the last contract year)
    if settings['compute_pay_measures']:
        df_dates = f.contract_year_and_raise(df_dates, settings)

    # DATE, YEAR, PAY RAISE
    # month_form -> long_form: mnum is matched against the df_dates index
    skel = skel.merge(df_dates, left_on=['mnum'], right_index=True)

    # AGE, SCALE*
    # starting age is stored on df so that it can be data-aligned into
    # skel once both frames share an empkey index
    df['s_age'] = f.starting_age(dobs, start_date)

    # data alignment: index skel by empkey (the column is kept too)
    skel = skel.set_index('empkey', verify_integrity=False, drop=False)

    skel['s_age'] = df['s_age']
    skel['fur'] = df['fur']

    # optional attribute columns, each controlled by its own setting
    optional_columns = (
        ('add_eg_col', 'eg'),
        ('add_retdate_col', 'retdate'),
        ('add_doh_col', 'doh'),
        ('add_ldate_col', 'ldate'),
        ('add_lname_col', 'lname'),
        ('add_line_col', 'line'),
        ('add_sg_col', 'sg'),
    )
    for switch, column in optional_columns:
        if settings[switch]:
            skel[column] = df[column]

    # RET_MARK
    # align each employee's final month number to the long-form rows,
    # then flag (1) the rows where it equals the row's month number
    df['ret_month'] = career_mths
    skel['ret_mark'] = df['ret_month']
    in_final_month = skel.ret_mark.values == skel.mnum.values
    skel['ret_mark'] = in_final_month.astype(int)

    # SCALE*
    if settings['compute_pay_measures']:

        df['s_lyears'] = f.longevity_at_startdate(list(df['ldate']),
                                                  start_date)
        skel['s_lyears'] = df['s_lyears']

        # one month expressed as a fraction of a year
        month_inc = 1 / 12

        # scale is the payrate longevity level for each employee-month:
        #   starting longevity years + (month number * monthly increment),
        #   truncated toward zero by the integer cast,
        #   then limited to the range 1..top_of_scale
        longevity_years = (skel['mnum'] * month_inc) + skel['s_lyears']
        skel['scale'] = np.clip(longevity_years.astype(int),
                                1,
                                settings['top_of_scale'])
        del skel['s_lyears']

        # starting longevity in months - per the original notes this is
        # only used for furloughed employee pay longevity in the
        # compute_measures routine (could be optional if recalls are not
        # part of the model)
        df['s_lmonths'] = f.longevity_at_startdate(list(df['ldate']),
                                                   settings['starting_date'],
                                                   return_as_months=True)
        skel['s_lmonths'] = df['s_lmonths']

    # AGE
    # monthly age from the starting age and the month number
    corr_ages = f.age_correction(month_nums,
                                 skel.s_age.values,
                                 settings['ret_age'])

    if settings['ret_age_increase']:
        skel['age'] = f.clip_ret_ages(settings['ret_incr_dict'],
                                      settings['init_ret_age'],
                                      skel.date.values,
                                      corr_ages)
    else:
        skel['age'] = corr_ages

    # the starting age was only needed to compute the age column
    del skel['s_age']

    # skel keeps its empkey index (and empkey column) for easy data
    # alignment with different list order keys

    # save results to pickle
    if settings['save_to_pickle']:
        skel.to_pickle(skel_path)
def prepare_master_list(name_int_demo=False, pre_sort=True):
    '''Add attribute columns to a master list.  One or more of these columns
    will be used by the build_list function to construct
    a "hybrid" list ordering.

    Employee groups must be listed in seniority order in relation to employees
    from the same group.  Order between groups is unimportant at this step.

    New columns added: ['age', 's_lmonths', 'jnum', 'job_count', 'rank_in_job',
    'jobp', 'eg_number', 'eg_spcnt'] (plus 'name_int' when name_int_demo
    is True)

    inputs

        name_int_demo
            if True, lname strings are converted to an integer then a
            corresponding alpha-numeric percentage for constructing lists by
            last name.  This is a demo only to show that any attribute
            may be used as a list weighting factor.

        pre_sort
            sort the master data dataframe doh and ldate columns prior to
            beginning any calculations.  This sort has no effect on the other
            columns.  The s_lmonths column will be calculated on the sorted
            ldate data.

    Job-related attributes are referenced to job counts from the config file.

    Returns the filtered master dataframe (rows with line == 1 or fur == 1
    read from 'dill/master.pkl') with the new columns attached.
    '''

    master_ = pd.read_pickle('dill/master.pkl')

    if pre_sort:
        # NOTE(review): the return value is discarded, so this relies on
        # sort_eg_attributes modifying master_ in place - confirm.
        sort_eg_attributes(master_)

    master = master_[(master_.line == 1) | (master_.fur == 1)].copy()

    # AGE and LONGEVITY
    master['age'] = f.starting_age(master.retdate)
    master['s_lmonths'] = f.longevity_at_startdate(list(master['ldate']),
                                                   return_months=True)

    # job counts per employee group, from the config file
    if cf.enhanced_jobs:
        group_job_counts = f.convert_jcnts_to_enhanced(cf.eg_counts,
                                                       cf.full_time_pcnt1,
                                                       cf.full_time_pcnt2)
    else:
        group_job_counts = cf.eg_counts

    egs = np.array(master.eg)

    # make a list of stovepipe jobs for each group (from config job counts).
    # Groups are numbered from 1 in the order of the config job counts.
    # The second input determines the length of the zero array formed
    # (possible excess).  master already contains only line == 1 or
    # fur == 1 rows, so the group head-count is simply the number of rows
    # carrying that eg number.
    jobs_list = [
        f.make_stovepipe_jobs_from_jobs_arr(job_counts,
                                            (egs == eg_num).sum())
        for eg_num, job_counts in enumerate(group_job_counts, start=1)]

    fur_level = f.max_of_nested_lists(jobs_list) + 1

    # Work on the per-group arrays one at a time.  They are deliberately
    # NOT stacked with np.array(jobs_list): their lengths follow the group
    # head-counts, and building an array from sequences of unequal length
    # raises ValueError on NumPy >= 1.24.
    job_arrays = [np.asarray(job_arr) for job_arr in jobs_list]

    # mark unassigned as furloughed (from zero to fur_level)
    for job_arr in job_arrays:
        job_arr[job_arr == 0] = fur_level

    jnums = np.zeros(egs.size)
    job_count = np.zeros(egs.size)

    # JNUM and JOB_COUNT data prep
    # For each group, walk its job levels in ascending order and hand each
    # level to the next `count` members of that group who have no job
    # number yet (jnums still zero).
    for eg_num, job_arr in enumerate(job_arrays, start=1):
        levels, level_counts = np.unique(job_arr, return_counts=True)
        for job, count in zip(levels, level_counts):
            open_slots = np.where((jnums == 0) & (egs == eg_num))[0][:count]
            job_count[open_slots] = count
            jnums[open_slots] = job

    # Employee group count (for spcnt column)
    eg_sizes = np.zeros(egs.size)
    groups, group_sizes = np.unique(master.eg, return_counts=True)
    for eg, count in zip(groups, group_sizes):
        eg_sizes[egs == eg] = count

    # Attribute columns assignment
    master['jnum'] = jnums.astype(int)
    master['job_count'] = job_count.astype(int)
    master['rank_in_job'] = master.groupby(['eg', 'jnum']).cumcount() + 1
    # position within the job as a fraction, offset by the job number;
    # the .0001 keeps the last employee in a job just below the next
    # whole number
    master['jobp'] = (master.rank_in_job /
                      master.job_count) + master.jnum - .0001
    master['eg_number'] = master.groupby('eg').cumcount() + 1
    master['eg_count'] = eg_sizes.astype(int)
    master['eg_spcnt'] = master.eg_number / master.eg_count
    if name_int_demo:
        master['name_int'] = names_to_integers(master.lname)[2]

    # eg_count was only needed to compute eg_spcnt
    master.pop('eg_count')

    return master
Beispiel #4
0
def main():
    '''Build the long-form "skeleton" dataframe from the pickled master list.

    Reads 'dill/master.pkl', 'dill/dict_settings.pkl' and
    'dill/last_month.pkl'.  Employees who pass the retirement-date and
    line/fur filters are expanded to one row per employee per month, and
    the following columns are attached: mnum, idx, empkey, mth_pcnt, the
    columns of the month-form dates frame, fur, the optional attribute
    columns switched on in the settings dictionary, ret_mark, scale and
    s_lmonths (only when 'compute_pay_measures' is set) and age.

    The result is written to 'dill/skeleton.pkl' when the 'save_to_pickle'
    setting is truthy.  Nothing is returned.

    If the master list cannot be read (OSError), a message is printed and
    the routine stops through sys.exit().
    '''
    folder, ext = 'dill/', '.pkl'
    master_path = folder + 'master' + ext

    # master.pkl is the prepared, order-independent, concatenated list data
    try:
        roster = pd.read_pickle(master_path)
    except OSError:
        import sys
        print('\nMaster list not found.  Run build_program_files script?\n\n' +
              'Skeleton build failed.\n\n' + '  >>> exiting routine.\n')
        sys.exit()

    skel_path = folder + 'skeleton' + ext

    settings = pd.read_pickle('dill/dict_settings.pkl')
    start_date = settings['starting_date']

    # keep only employees whose retdate is no earlier than one month
    # before the starting date
    earliest_retdate = start_date - pd.DateOffset(months=1)
    roster = roster[roster.retdate >= earliest_retdate]

    # furloughed employees (fur == 1) are kept along with line == 1
    df = roster[(roster.line == 1) | (roster.fur == 1)].copy()

    # the full master list is not needed past this point
    del roster

    # MNUM*
    # career months per employee (short_form); this feeds the mnum, idx
    # and mth_pcnt work below
    career_mths = f.career_months(df, start_date)

    # count of non-retired employees in each month (month_form)
    emp_per_month = f.count_per_month(career_mths)

    # month numbers, each repeated once per employee remaining in that
    # month (long_form)
    month_nums = f.gen_month_skeleton(emp_per_month)

    # MNUM
    # the month number column is the foundation of the long-form frame
    skel = pd.DataFrame(month_nums.astype(int), columns=['mnum'])

    # IDX*, EMPKEY
    # employee index and employee key for every employee-month row,
    # used for merging and data alignment further down
    long_index, long_emp = f.gen_skel_emp_idx(emp_per_month,
                                              career_mths,
                                              df.empkey.values)
    skel['idx'] = long_index.astype(int)
    skel['empkey'] = long_emp.astype(int)

    # birth dates (short_form), captured before df is re-indexed;
    # used for the starting age calculation
    dobs = list(df['dob'])

    # last-month pay percentage is looked up by retirement date:
    # index df by retdate so the assignment aligns on it, then return to
    # an empkey index (the empkey column is kept as well)
    df_last = pd.read_pickle('dill/last_month.pkl')
    df = df.set_index('retdate')
    df['lmonth_pcnt'] = df_last.last_pay
    df = df.reset_index().set_index('empkey',
                                    verify_integrity=False,
                                    drop=False)

    # one row per employee, keyed by (positional idx, final month number)
    last_month = pd.DataFrame({'mth_pcnt': df.lmonth_pcnt.values,
                               'final_month': career_mths})
    last_month['idx'] = last_month.index
    last_month = last_month.set_index(['idx', 'final_month'])

    skel = skel.merge(last_month,
                      how='outer',
                      left_on=['idx', 'mnum'],
                      right_index=True)

    # MTH_PCNT
    # rows that found no match in last_month get a value of 1
    skel['mth_pcnt'] = skel['mth_pcnt'].fillna(1)

    # DATE, YEAR, PAY RAISE*
    # month_form frame of end-of-month dates, one per model month
    month_ends = pd.date_range(start_date,
                               periods=len(emp_per_month),
                               freq='M')
    df_dates = pd.DataFrame(month_ends, columns=['date'])

    # per the original notes, this adds the year and a pay raise factor
    # (1.0 or a calculated raise after the last contract year)
    if settings['compute_pay_measures']:
        df_dates = f.contract_year_and_raise(df_dates, settings)

    # DATE, YEAR, PAY RAISE
    # month_form -> long_form: mnum is matched against the df_dates index
    skel = skel.merge(df_dates, left_on=['mnum'], right_index=True)

    # AGE, SCALE*
    # starting age is stored on df so that it can be data-aligned into
    # skel once both frames share an empkey index
    df['s_age'] = f.starting_age(dobs, start_date)

    # data alignment: index skel by empkey (the column is kept too)
    skel = skel.set_index('empkey', verify_integrity=False, drop=False)

    skel['s_age'] = df['s_age']
    skel['fur'] = df['fur']

    # optional attribute columns, each controlled by its own setting
    optional_columns = (
        ('add_eg_col', 'eg'),
        ('add_retdate_col', 'retdate'),
        ('add_doh_col', 'doh'),
        ('add_ldate_col', 'ldate'),
        ('add_lname_col', 'lname'),
        ('add_line_col', 'line'),
        ('add_sg_col', 'sg'),
    )
    for switch, column in optional_columns:
        if settings[switch]:
            skel[column] = df[column]

    # RET_MARK
    # align each employee's final month number to the long-form rows,
    # then flag (1) the rows where it equals the row's month number
    df['ret_month'] = career_mths
    skel['ret_mark'] = df['ret_month']
    in_final_month = skel.ret_mark.values == skel.mnum.values
    skel['ret_mark'] = in_final_month.astype(int)

    # SCALE*
    if settings['compute_pay_measures']:

        df['s_lyears'] = f.longevity_at_startdate(list(df['ldate']),
                                                  start_date)
        skel['s_lyears'] = df['s_lyears']

        # one month expressed as a fraction of a year
        month_inc = 1 / 12

        # scale is the payrate longevity level for each employee-month:
        #   starting longevity years + (month number * monthly increment),
        #   truncated toward zero by the integer cast,
        #   then limited to the range 1..top_of_scale
        longevity_years = (skel['mnum'] * month_inc) + skel['s_lyears']
        skel['scale'] = np.clip(longevity_years.astype(int),
                                1,
                                settings['top_of_scale'])
        del skel['s_lyears']

        # starting longevity in months - per the original notes this is
        # only used for furloughed employee pay longevity in the
        # compute_measures routine (could be optional if recalls are not
        # part of the model)
        df['s_lmonths'] = f.longevity_at_startdate(list(df['ldate']),
                                                   settings['starting_date'],
                                                   return_as_months=True)
        skel['s_lmonths'] = df['s_lmonths']

    # AGE
    # monthly age from the starting age and the month number
    corr_ages = f.age_correction(month_nums,
                                 skel.s_age.values,
                                 settings['ret_age'])

    if settings['ret_age_increase']:
        skel['age'] = f.clip_ret_ages(settings['ret_incr_dict'],
                                      settings['init_ret_age'],
                                      skel.date.values,
                                      corr_ages)
    else:
        skel['age'] = corr_ages

    # the starting age was only needed to compute the age column
    del skel['s_age']

    # skel keeps its empkey index (and empkey column) for easy data
    # alignment with different list order keys

    # save results to pickle
    if settings['save_to_pickle']:
        skel.to_pickle(skel_path)