skel['ldate'] = df.ldate if cf.add_lname_col: skel['lname'] = df.lname if cf.add_line_col: skel['line'] = df.line if cf.add_twa_col: skel['twa'] = df.twa if not cf.actives_only: skel['fur'] = df.fur # SCALE* if cf.compute_pay_measures: df['s_lyears'] = f.longevity_at_startdate(list(df['ldate'])) skel['s_lyears'] = df.s_lyears month_inc = (1 / 12) # scale is payrate longevity level # compute scale for each employee for each month # begin with s_lyears (starting longevity years) # add a monthly increment based on the month number (mnum) # convert to an integer which rounds toward zero # clip to min of 1 and max of top_of_scale (max pay longevity scale) skel['scale'] = np.clip( ((skel['mnum'] * month_inc) + skel['s_lyears']).astype(int), 1, cf.top_of_scale) skel.pop('s_lyears')
def main():
    '''Build the long-form "skeleton" dataframe from the master list.

    Reads 'dill/master.pkl', 'dill/dict_settings.pkl' and
    'dill/last_month.pkl'.  Employees who are active (line == 1) or
    furloughed (fur == 1), and whose retdate is no earlier than one
    month before the starting date, are expanded to one row per
    employee per month.

    Columns written to the skeleton:
        mnum, idx, empkey, mth_pcnt, date (plus whatever
        f.contract_year_and_raise adds when pay measures are computed),
        fur, ret_mark, age, the optional eg / retdate / doh / ldate /
        lname / line / sg columns selected by the 'add_<name>_col'
        settings, and scale and s_lmonths when 'compute_pay_measures'
        is set.

    The result is saved to 'dill/skeleton.pkl' when the 'save_to_pickle'
    setting is true.  Nothing is returned.

    Raises SystemExit with status 1 if the master list cannot be read.
    '''
    # read prepared list dataframe - proper column headers, column formats...
    # this is master.pkl, order-independent, concatenated list data
    pre, suf = 'dill/', '.pkl'
    master_path = pre + 'master' + suf
    skel_path_string = pre + 'skeleton' + suf

    try:
        df_mlist = pd.read_pickle(master_path)
    except OSError:
        print('\nMaster list not found. Run build_program_files script?\n\n' +
              'Skeleton build failed.\n\n' +
              ' >>> exiting routine.\n')
        import sys
        # non-zero status so calling scripts can detect the failed build
        sys.exit(1)

    sdict = pd.read_pickle('dill/dict_settings.pkl')
    start_date = sdict['starting_date']

    # only include pilots that are not retired prior to the starting_month
    df_mlist = df_mlist[
        df_mlist.retdate >= start_date - pd.DateOffset(months=1)]

    # include furloughees by default
    df = df_mlist[(df_mlist.line == 1) | (df_mlist.fur == 1)].copy()

    # the unfiltered master list is no longer needed
    del df_mlist

    # MNUM*
    # calculate the number of career months for each employee (short_form)
    # cmonths is used for mnum, idx, and mth_pcnt calculations
    cmonths = f.career_months(df, start_date)

    # count of non-retired employees for each month (month_form),
    # used as input for the long_form generators below
    nonret_each_month = f.count_per_month(cmonths)

    # first long form data generation.
    # month numbers, same month number repeated for each
    # month length (long_form)
    long_form_skeleton = f.gen_month_skeleton(nonret_each_month)

    # MNUM
    # (month number) - this is the basis for the long_form dataframe
    skel = pd.DataFrame(long_form_skeleton.astype(int), columns=['mnum'])

    # IDX*
    # grab emp index for each remaining
    # employee for each month - used for merging dfs later
    long_index, long_emp = f.gen_skel_emp_idx(nonret_each_month,
                                              cmonths,
                                              df.empkey.values)

    # IDX
    skel['idx'] = long_index.astype(int)

    # EMPKEY
    skel['empkey'] = long_emp.astype(int)

    # grab dobs from df column (short_form) before df is re-indexed;
    # used for the starting age calculation
    dobs = list(df['dob'])

    df_last = pd.read_pickle('dill/last_month.pkl')

    # temporarily index by retdate so that last_pay data-aligns
    # with each employee's retirement date
    df.set_index('retdate', inplace=True)
    df['lmonth_pcnt'] = df_last.last_pay
    df.reset_index(inplace=True)
    df.set_index('empkey', inplace=True, verify_integrity=False, drop=False)

    df_last_month = pd.DataFrame({'mth_pcnt': df.lmonth_pcnt.values,
                                  'final_month': cmonths})
    df_last_month['idx'] = df_last_month.index
    df_last_month.set_index(['idx', 'final_month'], inplace=True)

    skel = pd.merge(skel, df_last_month,
                    right_index=True,
                    left_on=['idx', 'mnum'],
                    how='outer')

    # MTH_PCNT
    # every month other than an employee's final month is a full month
    skel['mth_pcnt'] = skel.mth_pcnt.fillna(1)

    # DATE, YEAR, PAY RAISE*
    # set up date_range - end of month dates.
    # An explicit MonthEnd offset is the same frequency as the 'M'
    # string alias, which recent pandas releases have deprecated.
    df_dates = pd.DataFrame(pd.date_range(start_date,
                                          periods=len(nonret_each_month),
                                          freq=pd.offsets.MonthEnd()),
                            columns=['date'])

    # per the original notes, this adds the year and a pay raise
    # factor (1.0 or a calculated percentage after the last contract year)
    if sdict['compute_pay_measures']:
        df_dates = f.contract_year_and_raise(df_dates, sdict)

    # DATE, YEAR, PAY RAISE
    # bring the month_form columns into the long_form skeleton
    skel = pd.merge(skel, df_dates, right_index=True, left_on=['mnum'])

    # AGE, SCALE*
    # calculate and assign starting age.
    # Assign to a column in df and then data align into skeleton df.
    df['s_age'] = f.starting_age(dobs, start_date)

    # data alignment magic...set index to empkey
    skel.set_index('empkey', inplace=True, verify_integrity=False, drop=False)

    # AGE, RETDATE, EG, DOH, LDATE, LNAME,
    # FUR, RET_MONTH to long_form skeleton
    skel['s_age'] = df.s_age
    skel['fur'] = df.fur

    # optional columns, each enabled by its 'add_<name>_col' setting
    for col in ('eg', 'retdate', 'doh', 'ldate', 'lname', 'line', 'sg'):
        if sdict['add_' + col + '_col']:
            skel[col] = df[col]

    # RET_MARK
    # add last month number to df, data align to long-form skel, then
    # mark (1) the rows where retirement month is equal to month number
    df['ret_month'] = cmonths
    skel['ret_mark'] = df.ret_month
    skel['ret_mark'] = \
        (skel.ret_mark.values == skel.mnum.values).astype(int)

    # SCALE*
    # NOTE(review): nesting was reconstructed from a flattened source;
    # confirm that s_lmonths belongs under compute_pay_measures.
    if sdict['compute_pay_measures']:

        df['s_lyears'] = f.longevity_at_startdate(list(df['ldate']),
                                                  start_date)
        skel['s_lyears'] = df.s_lyears

        month_inc = (1 / 12)

        # scale is payrate longevity level
        # compute scale for each employee for each month
        # begin with s_lyears (starting longevity years)
        # add a monthly increment based on the month number (mnum)
        # convert to an integer which rounds toward zero
        # clip to min of 1 and max of top_of_scale (max pay longevity scale)
        skel['scale'] = np.clip(((skel['mnum'] * month_inc) +
                                 skel['s_lyears']).astype(int),
                                1,
                                sdict['top_of_scale'])
        skel.pop('s_lyears')

        # this column is only used for calculating furloughed employee pay
        # longevity in compute_measures routine.
        # ...could be an option if recalls are not part of model
        df['s_lmonths'] = f.longevity_at_startdate(list(df['ldate']),
                                                   start_date,
                                                   return_as_months=True)
        skel['s_lmonths'] = df.s_lmonths

    # AGE
    # calculate monthly age using starting age and month number
    corr_ages = f.age_correction(long_form_skeleton,
                                 skel.s_age.values,
                                 sdict['ret_age'])

    if sdict['ret_age_increase']:
        skel['age'] = f.clip_ret_ages(sdict['ret_incr_dict'],
                                      sdict['init_ret_age'],
                                      skel.date.values,
                                      corr_ages)
    else:
        skel['age'] = corr_ages

    skel.pop('s_age')

    # empkey index (keep empkey column)
    # this is for easy data alignment with different list order keys

    # save results to pickle
    if sdict['save_to_pickle']:
        skel.to_pickle(skel_path_string)
def prepare_master_list(name_int_demo=False, pre_sort=True):
    '''Add attribute columns to a master list.  One or more of these columns
    will be used by the build_list function to construct
    a "hybrid" list ordering.

    Employee groups must be listed in seniority order in relation to employees
    from the same group.  Order between groups is unimportant at this step.

    New columns added: ['age', 's_lmonths', 'jnum', 'job_count', 'rank_in_job',
    'jobp', 'eg_number', 'eg_spcnt'] (plus 'name_int' when name_int_demo
    is True)

    inputs

        name_int_demo
            if True, lname strings are converted to an integer then a
            corresponding alpha-numeric percentage for constructing lists by
            last name.  This is a demo only to show that any attribute
            may be used as a list weighting factor.

        pre_sort
            sort the master data dataframe doh and ldate columns prior to
            beginning any calculations.  This sort has no effect on the other
            columns.  The s_lmonths column will be calculated on the sorted
            ldate data.

    Job-related attributes are referenced to job counts from the config file.

    Returns the filtered master dataframe (active and furloughed
    employees only) with the new columns attached.
    '''
    master_ = pd.read_pickle('dill/master.pkl')

    if pre_sort:
        # return value is ignored - presumably sorts in place; verify
        sort_eg_attributes(master_)

    master = master_[(master_.line == 1) | (master_.fur == 1)].copy()

    # AGE and LONGEVITY
    # NOTE(review): other callers in this file pass a start date to these
    # helpers and use the keyword 'return_as_months' - confirm that this
    # call style matches the current helper signatures.
    master['age'] = f.starting_age(master.retdate)
    master['s_lmonths'] = f.longevity_at_startdate(list(master['ldate']),
                                                   return_months=True)

    if cf.enhanced_jobs:
        eg_counts = f.convert_jcnts_to_enhanced(cf.eg_counts,
                                                cf.full_time_pcnt1,
                                                cf.full_time_pcnt2)
    else:
        eg_counts = cf.eg_counts

    # make a list of stovepipe jobs for each group (from config job counts)
    jobs_list = []
    for eg_num, job_counts in enumerate(eg_counts, start=1):
        in_group = (master.eg == eg_num) & \
            ((master.line == 1) | (master.fur == 1))
        # the second input determines the length of the zero
        # array formed (possible excess)
        jobs_list.append(
            f.make_stovepipe_jobs_from_jobs_arr(job_counts, in_group.sum()))

    fur_level = f.max_of_nested_lists(jobs_list) + 1

    # Keep the per-group arrays in a plain list.  The groups normally have
    # different sizes, and wrapping such ragged input in np.array() raises
    # ValueError on current NumPy releases.
    job_arrays = [np.asarray(arr) for arr in jobs_list]

    # mark unassigned as furloughed (from zero to fur_level)
    for job_arr in job_arrays:
        np.put(job_arr, np.where(job_arr == 0)[0], fur_level)

    egs = np.array(master.eg)
    jnums = np.zeros(egs.size)
    job_count = np.zeros(egs.size)

    # JNUM and JOB_COUNT data prep
    # for each group, hand out each job level in turn to the first
    # 'count' members of that group who have no job number yet
    for eg_num, job_arr in enumerate(job_arrays, start=1):
        levels, level_counts = np.unique(job_arr, return_counts=True)
        for job, count in zip(levels, level_counts):
            targets = np.where((jnums == 0) & (egs == eg_num))[0][:count]
            np.put(job_count, targets, count)
            np.put(jnums, targets, job)

    # Employee group count (for spcnt column)
    eg_sizes = np.zeros(egs.size)
    groups, group_counts = np.unique(master.eg, return_counts=True)
    for eg, count in zip(groups, group_counts):
        np.put(eg_sizes, np.where(egs == eg)[0], count)

    # Attribute columns assignment
    master['jnum'] = jnums.astype(int)
    master['job_count'] = job_count.astype(int)
    master['rank_in_job'] = master.groupby(['eg', 'jnum']).cumcount() + 1
    # job number plus fractional position within the job; the small
    # offset keeps the last member of a job below the next job number
    master['jobp'] = (master.rank_in_job /
                      master.job_count) + master.jnum - .0001
    master['eg_number'] = master.groupby('eg').cumcount() + 1
    master['eg_count'] = eg_sizes.astype(int)
    master['eg_spcnt'] = master.eg_number / master.eg_count

    if name_int_demo:
        master['name_int'] = names_to_integers(master.lname)[2]

    # eg_count was only needed to compute eg_spcnt
    master.pop('eg_count')

    return master
def main():
    '''Build the long-form "skeleton" dataframe from the master list.

    Reads 'dill/master.pkl', 'dill/dict_settings.pkl' and
    'dill/last_month.pkl'.  Employees who are active (line == 1) or
    furloughed (fur == 1), and whose retdate is no earlier than one
    month before the starting date, are expanded to one row per
    employee per month.

    Columns written to the skeleton:
        mnum, idx, empkey, mth_pcnt, date (plus whatever
        f.contract_year_and_raise adds when pay measures are computed),
        fur, ret_mark, age, the optional eg / retdate / doh / ldate /
        lname / line / sg columns selected by the 'add_<name>_col'
        settings, and scale and s_lmonths when 'compute_pay_measures'
        is set.

    The result is saved to 'dill/skeleton.pkl' when the 'save_to_pickle'
    setting is true.  Nothing is returned.

    Raises SystemExit with status 1 if the master list cannot be read.
    '''
    # read prepared list dataframe - proper column headers, column formats...
    # this is master.pkl, order-independent, concatenated list data
    pre, suf = 'dill/', '.pkl'
    master_path = pre + 'master' + suf
    skel_path_string = pre + 'skeleton' + suf

    try:
        df_mlist = pd.read_pickle(master_path)
    except OSError:
        print('\nMaster list not found. Run build_program_files script?\n\n' +
              'Skeleton build failed.\n\n' +
              ' >>> exiting routine.\n')
        import sys
        # non-zero status so calling scripts can detect the failed build
        sys.exit(1)

    sdict = pd.read_pickle('dill/dict_settings.pkl')
    start_date = sdict['starting_date']

    # only include pilots that are not retired prior to the starting_month
    df_mlist = df_mlist[
        df_mlist.retdate >= start_date - pd.DateOffset(months=1)]

    # include furloughees by default
    df = df_mlist[(df_mlist.line == 1) | (df_mlist.fur == 1)].copy()

    # the unfiltered master list is no longer needed
    del df_mlist

    # MNUM*
    # calculate the number of career months for each employee (short_form)
    # cmonths is used for mnum, idx, and mth_pcnt calculations
    cmonths = f.career_months(df, start_date)

    # count of non-retired employees for each month (month_form),
    # used as input for the long_form generators below
    nonret_each_month = f.count_per_month(cmonths)

    # first long form data generation.
    # month numbers, same month number repeated for each
    # month length (long_form)
    long_form_skeleton = f.gen_month_skeleton(nonret_each_month)

    # MNUM
    # (month number) - this is the basis for the long_form dataframe
    skel = pd.DataFrame(long_form_skeleton.astype(int), columns=['mnum'])

    # IDX*
    # grab emp index for each remaining
    # employee for each month - used for merging dfs later
    long_index, long_emp = f.gen_skel_emp_idx(nonret_each_month,
                                              cmonths,
                                              df.empkey.values)

    # IDX
    skel['idx'] = long_index.astype(int)

    # EMPKEY
    skel['empkey'] = long_emp.astype(int)

    # grab dobs from df column (short_form) before df is re-indexed;
    # used for the starting age calculation
    dobs = list(df['dob'])

    df_last = pd.read_pickle('dill/last_month.pkl')

    # temporarily index by retdate so that last_pay data-aligns
    # with each employee's retirement date
    df.set_index('retdate', inplace=True)
    df['lmonth_pcnt'] = df_last.last_pay
    df.reset_index(inplace=True)
    df.set_index('empkey', inplace=True, verify_integrity=False, drop=False)

    df_last_month = pd.DataFrame({'mth_pcnt': df.lmonth_pcnt.values,
                                  'final_month': cmonths})
    df_last_month['idx'] = df_last_month.index
    df_last_month.set_index(['idx', 'final_month'], inplace=True)

    skel = pd.merge(skel, df_last_month,
                    right_index=True,
                    left_on=['idx', 'mnum'],
                    how='outer')

    # MTH_PCNT
    # every month other than an employee's final month is a full month
    skel['mth_pcnt'] = skel.mth_pcnt.fillna(1)

    # DATE, YEAR, PAY RAISE*
    # set up date_range - end of month dates.
    # An explicit MonthEnd offset is the same frequency as the 'M'
    # string alias, which recent pandas releases have deprecated.
    df_dates = pd.DataFrame(pd.date_range(start_date,
                                          periods=len(nonret_each_month),
                                          freq=pd.offsets.MonthEnd()),
                            columns=['date'])

    # per the original notes, this adds the year and a pay raise
    # factor (1.0 or a calculated percentage after the last contract year)
    if sdict['compute_pay_measures']:
        df_dates = f.contract_year_and_raise(df_dates, sdict)

    # DATE, YEAR, PAY RAISE
    # bring the month_form columns into the long_form skeleton
    skel = pd.merge(skel, df_dates, right_index=True, left_on=['mnum'])

    # AGE, SCALE*
    # calculate and assign starting age.
    # Assign to a column in df and then data align into skeleton df.
    df['s_age'] = f.starting_age(dobs, start_date)

    # data alignment magic...set index to empkey
    skel.set_index('empkey', inplace=True, verify_integrity=False, drop=False)

    # AGE, RETDATE, EG, DOH, LDATE, LNAME,
    # FUR, RET_MONTH to long_form skeleton
    skel['s_age'] = df.s_age
    skel['fur'] = df.fur

    # optional columns, each enabled by its 'add_<name>_col' setting
    for col in ('eg', 'retdate', 'doh', 'ldate', 'lname', 'line', 'sg'):
        if sdict['add_' + col + '_col']:
            skel[col] = df[col]

    # RET_MARK
    # add last month number to df, data align to long-form skel, then
    # mark (1) the rows where retirement month is equal to month number
    df['ret_month'] = cmonths
    skel['ret_mark'] = df.ret_month
    skel['ret_mark'] = \
        (skel.ret_mark.values == skel.mnum.values).astype(int)

    # SCALE*
    # NOTE(review): nesting was reconstructed from a flattened source;
    # confirm that s_lmonths belongs under compute_pay_measures.
    if sdict['compute_pay_measures']:

        df['s_lyears'] = f.longevity_at_startdate(list(df['ldate']),
                                                  start_date)
        skel['s_lyears'] = df.s_lyears

        month_inc = (1 / 12)

        # scale is payrate longevity level
        # compute scale for each employee for each month
        # begin with s_lyears (starting longevity years)
        # add a monthly increment based on the month number (mnum)
        # convert to an integer which rounds toward zero
        # clip to min of 1 and max of top_of_scale (max pay longevity scale)
        skel['scale'] = np.clip(((skel['mnum'] * month_inc) +
                                 skel['s_lyears']).astype(int),
                                1,
                                sdict['top_of_scale'])
        skel.pop('s_lyears')

        # this column is only used for calculating furloughed employee pay
        # longevity in compute_measures routine.
        # ...could be an option if recalls are not part of model
        df['s_lmonths'] = f.longevity_at_startdate(list(df['ldate']),
                                                   start_date,
                                                   return_as_months=True)
        skel['s_lmonths'] = df.s_lmonths

    # AGE
    # calculate monthly age using starting age and month number
    corr_ages = f.age_correction(long_form_skeleton,
                                 skel.s_age.values,
                                 sdict['ret_age'])

    if sdict['ret_age_increase']:
        skel['age'] = f.clip_ret_ages(sdict['ret_incr_dict'],
                                      sdict['init_ret_age'],
                                      skel.date.values,
                                      corr_ages)
    else:
        skel['age'] = corr_ages

    skel.pop('s_age')

    # empkey index (keep empkey column)
    # this is for easy data alignment with different list order keys

    # save results to pickle
    if sdict['save_to_pickle']:
        skel.to_pickle(skel_path_string)