def estimated_population_size(person_year_table, areas_sample_table, profession, sampling_year_range):
    """
    Estimate the total size of the profession for the years in sampling_year_range.

    Assumes that the ratio between the sum of the sampled appeals areas and the sum of the rest of the
    areas remained constant over the entire data period.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param areas_sample_table: a table of person-years within the sampled areas, as a list of lists;
                               assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, range of years for which we're estimating mobility,
                                e.g. (1998, 2004)
    :return: dict of estimated population sizes: keys are years, values are estimates
    """
    # yearly totals for the sample and for the whole population
    samp_size = totals_in_out.pop_cohort_counts(areas_sample_table, 1978, 2020, profession,
                                                cohorts=False, unit_type="nivel")
    pop_size = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                               cohorts=False, unit_type="nivel")

    # sample-to-population size ratios for 2006, 2007 and 2008, averaged across the three years
    ratio_years = (2006, 2007, 2008)
    samp_to_pop_ratios = [samp_size["grand_total"][r_yr]["total_size"]
                          / pop_size["grand_total"][r_yr]["total_size"]
                          for r_yr in ratio_years]
    avg_samp_to_pop_ratio = statistics.mean(samp_to_pop_ratios)

    # for each year in range, multiply the sample count by the reciprocal of the average ratio:
    # these are the population estimates for the years in which we only have samples
    first_yr, last_yr = sampling_year_range
    return {str(year): round(float(samp_size["grand_total"][year]["total_size"])
                             / float(avg_samp_to_pop_ratio))
            for year in samp_size["grand_total"]
            if first_yr <= int(year) <= last_yr}
def entry_exit_unit_table(person_year_table, start_year, end_year, profession, unit_type, out_dir, entry=True):
    """
    Make a csv table where the rows are units and the columns are years; each unit contributes two rows:
    the percent of entries/departures relative to all people in that unit-year, and (underneath) the raw
    count of entries/departures in parentheses.

    NB: units can be geographic regions (e.g. notary "camera") or hierarchical level (e.g. tribunal level
    for judges)

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, the first year we consider
    :param end_year: int, the last year we consider
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param unit_type: string, type of the unit as it appears in header of person_year_table (e.g. "camera")
    :param out_dir: directory where the table will live
    :param entry: bool, True if entry cohorts, False if exit cohorts (i.e. everyone who left in year X)
    :return: None
    """
    # if we look at entry cohorts avoid left censor and include right censor (which function ignores by default)
    if entry:
        start_year += 1
        end_year += 1

    # get data on cohorts by year and unit
    cohorts_per_unit = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                       cohorts=True, unit_type=unit_type, entry=entry)

    # write the table to disk
    type_of_cohort = 'entry' if entry else 'departure'
    out_path = out_dir + profession + '_' + type_of_cohort + '_' + unit_type + '_rates.csv'
    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_path, 'w', newline='') as o_file:
        fieldnames = ['unit'] + list(range(start_year, end_year))  # omit last year: all leave in right censor year
        writer = csv.DictWriter(o_file, fieldnames=fieldnames)
        writer.writeheader()

        # iterate over units
        for unit, years in cohorts_per_unit.items():
            percent_row = {'unit': unit}
            count_row = {'unit': ''}
            # iterate over the years:
            for year, measures in years.items():
                if start_year <= int(year) <= end_year - 1:  # stay within bounds
                    percent_row.update({year: measures['chrt_prcnt_of_pop']})
                    count_row.update({year: '(' + str(measures['total_size']) + ')'})
            # display the count row under the percent row
            writer.writerow(percent_row)
            writer.writerow(count_row)
def estimated_population_size(person_year_table, profession):
    """
    Estimate the total size of the profession (i.e. of the population) for years in which we only have
    a sample.

    For years where we DO observe the whole population we compute

        population inflation ratio = population size / sample size

    and for sample-only years we estimate

        population size = sample size * population inflation ratio

    NB: this assumes that the ratio between the total population and the sum of the sampled areas is
    roughly constant across the years whose population sizes we're estimating.

    NB: shadows the earlier module-level definition of the same name.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :return: dict of estimated population sizes: keys are years, values are estimates
    """
    samp_yrs, samp_as, ratio_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    pop_yrs, total_yrs = pop_yr_range[profession], total_range[profession]

    # restrict to the sampled appellate areas
    areas_sample_table = sample.appellate_area_sample(person_year_table, profession, samp_as)

    # yearly totals for the area sample and for the full population
    samp_size = totals_in_out.pop_cohort_counts(areas_sample_table, total_yrs[0], total_yrs[1], profession,
                                                cohorts=False, unit_type="nivel")
    pop_size = totals_in_out.pop_cohort_counts(person_year_table, pop_yrs[0], pop_yrs[1], profession,
                                               cohorts=False, unit_type="nivel")

    # average sample and population sizes across the ratio years
    n_ratio_yrs = float(len(ratio_yrs))
    avg_samp_size = float(sum(samp_size["grand_total"][r_yr]["total_size"] for r_yr in ratio_yrs)) / n_ratio_yrs
    avg_total_size = float(sum(pop_size["grand_total"][r_yr]["total_size"] for r_yr in ratio_yrs)) / n_ratio_yrs
    pop_inflation_ratio = avg_total_size / avg_samp_size

    # for each sample-only year, inflate the observed sample size; round to 4 decimals
    return {yr: round(float(samp_size["grand_total"][yr]["total_size"] * pop_inflation_ratio), 4)
            for yr in range(samp_yrs[0], samp_yrs[1] + 1)}
def adjusted_retirement_counts(person_year_table, profession, weights=False):
    """
    Adjust raw sample retirement counts for the fact that leaving the sample is not the same as leaving
    the profession.

    Raw sample retirement counts conflate genuine departures from the profession with mere departures
    from the sampled areas (people who move elsewhere but stay in the profession), biasing the counts
    upwards. Using the total population we compute, per hierarchical level,

        genuine retirement fraction = genuine retirements / (genuine retirements + sample departures)

    and deflate the raw sample retirement counts by that fraction:

        adjusted number of retirements = raw sample retirement count * genuine retirement fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True return the deflation fractions themselves instead of adjusted counts
    :return: a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial
             hierarchy, and base values are the adjusted retirement counts (or, if weights=True, a dict
             of level -> fraction)
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    header = helpers.get_header(profession, 'preprocess')
    yr_col_idx, ca_cod_idx, lvl_col_idx = header.index('an'), header.index('ca cod'), header.index('nivel')

    # sort the population table by person and year, then group it into individual careers
    sorted_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)
    people = helpers.group_table_by_persons(sorted_table, profession)

    # tally genuine retirements vs. sample departures, per level
    # NB: four possible levels, even though level 3 (Appeals courts) only began in 1993
    ret_fracts = {lvl: {"gen_rets": 0, "samp_leaves": 0} for lvl in range(1, 5)}
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr = pers_yr[yr_col_idx]
            current_lvl = int(pers_yr[lvl_col_idx])
            current_area = pers_yr[ca_cod_idx]
            # only person-years used for the fraction AND within the sampled areas count
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx < len(person) - 1:
                    # look ahead: moving outside the sampled areas next year is a sample departure
                    if person[idx + 1][ca_cod_idx] not in samp_as:
                        ret_fracts[current_lvl]["samp_leaves"] += 1
                else:
                    # career's last observed year: a genuine retirement
                    # NB: this always assumes we pick a sampling year that is less than the right-censor year
                    ret_fracts[current_lvl]["gen_rets"] += 1

    # per level: average both tallies across the fraction years, then compute the final fraction
    for lvl in ret_fracts:
        avg_gen = float(ret_fracts[lvl]["gen_rets"]) / float(len(fracts_yrs))
        avg_leaves = float(ret_fracts[lvl]["samp_leaves"]) / float(len(fracts_yrs))
        ret_fracts[lvl] = helpers.weird_division(avg_gen, (avg_gen + avg_leaves), mult_const=True)

    # get the raw retirement counts within the sampled areas
    cas_sample_table = sample.appellate_area_sample(sorted_table, profession, samp_as)
    samp_ret_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=False)
    samp_ret_counts.pop("grand_total")  # don't need the grand total

    # deflate the raw counts by the per-level fraction; round result to four decimals
    for lvl in samp_ret_counts:
        for yr in samp_ret_counts[lvl]:
            samp_ret_counts[lvl][yr] = round(samp_ret_counts[lvl][yr]["total_size"] * ret_fracts[int(lvl)], 4)

    return ret_fracts if weights else samp_ret_counts
def adjusted_entry_counts(person_year_table, profession, weights=False):
    """
    Adjust raw sample entry counts for the fact that entering the sample is not the same as entering
    the profession.

    Raw sample entry counts conflate genuine recruits with people who were already in the profession but
    outside the sampled areas, biasing the counts upwards. Using the total population we compute, per
    hierarchical level,

        genuine entry fraction = genuine entries / (genuine entries + sample-entering counts)

    and deflate the raw sample entry counts by that fraction:

        adjusted number of entries = sample entry count * genuine entry fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True return the deflation fractions themselves instead of adjusted counts
    :return: a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial
             hierarchy, and base values are the adjusted entry counts (or, if weights=True, a dict of
             level -> fraction)
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    header = helpers.get_header(profession, 'preprocess')
    yr_col_idx, ca_cod_idx, lvl_col_idx = header.index('an'), header.index('ca cod'), header.index('nivel')

    # sort the population table by person and year, then group it into individual careers
    sorted_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)
    people = helpers.group_table_by_persons(sorted_table, profession)

    # tally genuine entries vs. sample entries from elsewhere in the profession, per level
    # NB: four possible levels, even though level 3 (Appeals courts) only began in 1993
    ent_fracts = {lvl: {"gen_ents": 0, "samp_ents": 0} for lvl in range(1, 5)}
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr = pers_yr[yr_col_idx]
            current_lvl = int(pers_yr[lvl_col_idx])
            current_area = pers_yr[ca_cod_idx]
            # only person-years used for the fraction AND within the sampled areas count
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx == 0:
                    # career's very first year: a genuine entry into the profession
                    # NB: this always assumes that we skip the left-censor year
                    ent_fracts[current_lvl]["gen_ents"] += 1
                if 1 < idx:
                    # look behind: arriving from a different appellate area counts as entering the
                    # sample from within the profession
                    # NOTE(review): idx == 1 is excluded from this look-behind even though a previous
                    # year exists — confirm this is intended
                    if current_area != person[idx - 1][ca_cod_idx]:
                        ent_fracts[current_lvl]["samp_ents"] += 1

    # per level: average both tallies across the fraction years, then compute the final fraction
    for lvl in ent_fracts:
        avg_gen = float(ent_fracts[lvl]["gen_ents"]) / float(len(fracts_yrs))
        avg_samp = float(ent_fracts[lvl]["samp_ents"]) / float(len(fracts_yrs))
        ent_fracts[lvl] = helpers.weird_division(avg_gen, (avg_gen + avg_samp), mult_const=True)

    # get the raw entry counts within the sampled areas
    cas_sample_table = sample.appellate_area_sample(sorted_table, profession, samp_as)
    samp_ent_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=True)
    samp_ent_counts.pop("grand_total")  # don't need the grand total

    # deflate the raw counts by the per-level fraction; round result to four decimals
    for lvl in samp_ent_counts:
        for yr in samp_ent_counts[lvl]:
            samp_ent_counts[lvl][yr] = round(samp_ent_counts[lvl][yr]["total_size"] * ent_fracts[int(lvl)], 4)

    return ent_fracts if weights else samp_ent_counts
def get_raw_counts(person_year_table, profession, sampling_year_range):
    """
    Get counts, in different years, of: number of professionals, entries, retirements, promotions.
    Keep only data for years within sampling_year_range.

    :param person_year_table: a table of person-years, as a list of lists; comes with header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, range of years for which we're estimating mobility,
                                e.g. (1998, 2004)
    :return: dict of dicts, where top-level keys indicate nature of count (e.g. "retirements") and
             bottom-level dicts show counts per judicial level (e.g. lvl 2, i.e. tribunals)
    """
    low, high = sampling_year_range

    total_counts = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                                   cohorts=False, unit_type="nivel")
    entries = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                              cohorts=True, unit_type="nivel", entry=True)
    retirements = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                                  cohorts=True, unit_type="nivel", entry=False)
    promotions = hierarchical.hierarchical_mobility(person_year_table, profession)
    years = list(promotions.keys())

    retirements.pop("grand_total")
    total_counts.pop("grand_total")

    # keep only level data for years within the specified year range
    for yr in years:
        out_of_range = int(yr) < low or int(yr) > high

        # toss out extraneous years for promotions
        if out_of_range and yr in promotions:
            promotions.pop(yr)

        # toss out extraneous years for entries, retirements and total counts
        for lvl in retirements:
            if out_of_range and int(yr) in retirements[lvl]:
                entries[lvl].pop(int(yr))
                retirements[lvl].pop(int(yr))
                total_counts[lvl].pop(int(yr))

    # for entries, retirements and total counts keep only the total size for each year;
    # keys of promotions are the years, apply to all dicts
    ent_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    ret_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    tot_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    for lvl in retirements:
        for year in retirements[lvl]:
            ent_counts[str(year)][lvl] = entries[lvl][year]["total_size"]
            ret_counts[str(year)][lvl] = retirements[lvl][year]["total_size"]
            tot_counts[str(year)][lvl] = total_counts[lvl][year]["total_size"]

    # for promotions, keep only the count of upward moves per year and level
    prom_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    for year in promotions:
        for lvl in promotions[year]:
            prom_counts[year][str(lvl)] = promotions[year][lvl]["up"]["total"]

    return {"entries": ent_counts,
            "retirements": ret_counts,
            "promotions": prom_counts,
            "total counts": tot_counts}
def entry_exit_gender(person_year_table, start_year, end_year, profession, out_dir, entry=True, unit_type=None):
    """
    Make a table that shows the count and percentage of entry and exit cohorts for each gender, and for
    each unit if applicable.

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, the first year we consider
    :param end_year: int, the last year we consider
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: directory where the table will live
    :param entry: bool, True if entry cohorts, False if exit cohorts (i.e. everyone who left in year X)
    :param unit_type: None, or if provided, a string indicating the type of unit (e.g. appellate court region)
    :return: None
    """
    type_of_cohort = 'entry' if entry else 'departure'

    if unit_type:
        out_path = out_dir + profession + '_' + unit_type + '_' + type_of_cohort + '_cohorts_gender.csv'
        fieldnames = ["unit"] + ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        cohorts = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                  cohorts=True, unit_type=unit_type, entry=entry)
    else:
        out_path = out_dir + profession + '_' + type_of_cohort + '_cohorts_gender.csv'
        fieldnames = ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        cohorts = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                  cohorts=True, unit_type=None, entry=entry)

    # write table to disk
    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_path, 'w', newline='') as o_file:
        writer = csv.DictWriter(o_file, fieldnames=fieldnames)
        writer.writeheader()

        # if we're given unit types
        if unit_type:
            # iterate over units
            for unit, years in cohorts.items():
                if unit != 'grand_total':
                    # iterate over the years:
                    for year, metrics in years.items():
                        if start_year <= int(year) <= end_year - 1:  # stay within bounds
                            writer.writerow({"unit": unit, "year": year,
                                             "female": metrics['f'], "male": metrics["m"],
                                             "don't know": metrics['dk'],
                                             "total count": metrics['total_size'],
                                             "percent female": metrics['percent_female']})
        else:  # no units, just straight years
            for year, metrics in cohorts['grand_total'].items():
                writer.writerow({"year": year,
                                 "female": metrics['f'], "male": metrics["m"],
                                 "don't know": metrics['dk'],
                                 "total count": metrics['total_size'],
                                 "percent female": metrics['percent_female']})
def year_counts_table(person_year_table, start_year, end_year, profession, out_dir, unit_type=None):
    """
    Makes a table of yearly population counts, and optionally breaks down total counts by unit_type.
    Also appends rows listing which appeals and tribunal areas appear in the input table.

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, the first year we consider
    :param end_year: int, the last year we consider
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: directory where the table will live
    :param unit_type: None, or if provided, a string indicating the type of unit (e.g. appellate court region)
    :return: None
    """
    if unit_type:
        out_path = out_dir + profession + '_' + unit_type + '_year_totals.csv'
        fieldnames = ["unit"] + ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        year_metrics = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                       cohorts=False, unit_type=unit_type)
    else:
        out_path = out_dir + profession + '_year_totals.csv'
        fieldnames = ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        year_metrics = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                       cohorts=False)

    # make table and write to disk
    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_path, 'w', newline='') as o_file:
        writer = csv.DictWriter(o_file, fieldnames=fieldnames)
        writer.writeheader()

        if unit_type:
            # iterate over units
            for unit, years in year_metrics.items():
                if unit != 'grand_total':
                    # iterate over years:
                    for year, metrics in years.items():
                        if start_year <= int(year) <= end_year:  # stay within bounds
                            writer.writerow({"unit": unit, "year": year,
                                             "female": metrics['f'], "male": metrics["m"],
                                             "don't know": metrics['dk'],
                                             "total count": metrics['total_size'],
                                             "percent female": metrics['percent_female']})
        else:  # no units, just straight years
            for year, metrics in year_metrics['grand_total'].items():
                writer.writerow({"year": year,
                                 "female": metrics['f'], "male": metrics["m"],
                                 "don't know": metrics['dk'],
                                 "total count": metrics['total_size'],
                                 "percent female": metrics['percent_female']})

        # finally, show which appeals and tribunal areas were sampled
        ca_col_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
        trib_col_idx = helpers.get_header(profession, 'preprocess').index('trib cod')
        ca_areas = sorted(list({py[ca_col_idx] for py in person_year_table}))
        tb_areas = sorted(list({py[trib_col_idx] for py in person_year_table}))
        writer.writerow({"year": ''})
        writer.writerow({"year": "SAMPLED COURT OF APPEALS AREAS", "female": ca_areas})
        writer.writerow({"year": "SAMPLED TRIBUNAL AREAS", "female": tb_areas})
def make_vacancy_transition_tables(person_year_table, profession, out_dir, years, averaging_years=None,
                                   area_samp=False, out_dir_area_samp=None):
    """
    Make a csv containing one sub-table for each of the years that we select, with each sub-table showing
    the transition probabilities between hierarchical levels of vacancies. Optionally, we may also include
    a table that averages across desired years, e.g. 1984-1989.

    Each sub-table should be NxN+1, where N = number of levels, and the last column represents vacancies
    leaving the system, i.e. people being recruited into the system.

    NB: diagonals signify mobility WITHIN the level

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param out_dir: str, the path to where the transition matrices will live
    :param years: list of ints, the years for which we want vacancy probability transition matrixes
    :param averaging_years: list of ints over which we want to average vacancy frequency tables,
                            e.g. [1985, 1986, 1987]
    :param area_samp: bool, True if we want to sample from specific areas
    :param out_dir_area_samp: if given, str showing the out-directory where we want the vacancy transition
                              tables for the sample areas to live
    :return: None
    """
    averaging_years = averaging_years if averaging_years else []  # if no averaging years provided, make empty list
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)
    proms_weights, demos_weights, transfs_weights = None, None, None  # throws up errors if things go awry

    # get entry counts, in easy format
    entry_counts = totals_in_out.pop_cohort_counts(sorted_person_year_table, years[0], years[-1], profession,
                                                   cohorts=True, unit_type="nivel", entry=True)
    entry_counts.pop("grand_total")  # don't need the grand total
    for lvl in entry_counts:
        for yr in entry_counts[lvl]:
            entry_counts[lvl][yr] = entry_counts[lvl][yr]["total_size"]

    if area_samp:
        # I hard code these in since they change so rarely
        samp_areas = {"judges": ["CA1", "CA7", "CA9", "CA12", "-88"], "prosecutors": []}
        samp_yr_range = {"judges": [1980, 2003], "prosecutors": []}
        samp_yrs, samp_as = samp_yr_range[profession], samp_areas[profession]

        # get sample-adjusted entry counts and sample weights for mobility
        entry_counts = area_samples.adjusted_entry_counts(person_year_table, profession)
        proms_weights = area_samples.adjusted_promotion_counts(sorted_person_year_table, profession, weights=True)
        demos_weights = area_samples.adjusted_demotion_counts(sorted_person_year_table, profession, weights=True)
        transfs_weights = area_samples.adjusted_lateral_transfer_counts(sorted_person_year_table, profession,
                                                                        weights=True)

        # restrict person-year table to sampling areas and redirect the out-directory
        sorted_person_year_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
        out_dir = out_dir_area_samp

    # get person-level transition frequencies between levels
    trans_freqs = inter_level_transition_matrices(sorted_person_year_table, profession)

    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_dir + "vacancy_probability_transition_matrixes.csv", "w", newline='') as out_f:
        writer = csv.writer(out_f)

        # accumulator for the optional across-years average; unused if averaging_years stays empty
        # BUG FIX: was np.empty, which holds uninitialized memory — np.add would then fold garbage
        # into the averaged matrix; np.zeros gives a proper additive identity
        avg_vac_trans_mat = np.zeros((4, 5), float)

        # for each sampling year
        for yr in years:
            # make array of zeros, for four levels; not all years have four levels, but zero rows/columns
            # are harmless
            trans_mat = np.zeros((4, 4))

            for lvl in range(1, 5):  # departure levels in the system, i.e. the level FROM which mobility happens
                if lvl in trans_freqs[yr]:  # if the levels exist in that year (since some are added later)
                    # now weigh the observed values
                    # NB: route = mobility route, e.g. "1-2" means "mobility from level 1 to level 2"
                    for route, mob_freq in trans_freqs[yr][lvl].items():
                        # ignore retirements, non-movements, sums, and discontinuities
                        if route.split("-")[1].isdigit():
                            # level you leave and level you go to; -1 since numpy zero indexes
                            departing, arriving = int(route.split("-")[0]) - 1, int(route.split("-")[1]) - 1

                            # put frequency counts in the matrix; if sampling, weigh the counts
                            if departing < arriving:  # promotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * proms_weights[lvl], 5)
                            if departing == arriving:  # lateral transfers
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * transfs_weights[lvl], 5)
                            if departing > arriving:  # demotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * demos_weights[lvl], 5)

            # transpose the person-level mobility frequency matrix to get the vacancy mobility matrix
            vac_trans_mat = np.transpose(trans_mat)

            # by convention, we thus far treated levels in incrementing order, i.e. level 1 < 2 < 3 < 4.
            # The convention in vacancy chains studies is that 1 > 2 > 3 > 4, and to get that we transpose
            # the array along the anti-diagonal/off-diagonal
            vac_trans_mat = vac_trans_mat[::-1, ::-1].T

            # in the last column we put vacancy "retirements", i.e. entries of people into the system
            entry_freqs = [entry_counts[str(level)][yr] for level in range(1, 5) if str(level) in entry_counts]
            entries_col = np.asarray(entry_freqs[::-1])[..., None]  # give it Nx1 shape; reverse for 1 > 2 > 3...
            vac_trans_mat = np.append(vac_trans_mat, entries_col, 1)

            if yr in averaging_years:
                avg_vac_trans_mat = np.add(avg_vac_trans_mat, vac_trans_mat)

            vac_prob_mat = freq_mat_to_prob_mat(vac_trans_mat.tolist(), round_to=5)

            # add that transition probability matrix to table
            writer.writerow([profession.upper(), yr])
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(vac_prob_mat)):
                writer.writerow([header[1:][i]] + vac_prob_mat[i])
            writer.writerow(["\n"])

        if averaging_years:
            # NOTE(review): dividing by len(averaging_years) - 1 rather than len(averaging_years) looks
            # suspicious for an average — confirm this denominator is intended
            avg_vac_trans_mat = np.divide(avg_vac_trans_mat, float(len(averaging_years) - 1))
            avg_vac_prob_mat = freq_mat_to_prob_mat(avg_vac_trans_mat.tolist(), round_to=5)
            writer.writerow(["AVERAGED ACROSS YEARS"] + averaging_years)
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(avg_vac_prob_mat)):
                writer.writerow([header[1:][i]] + avg_vac_prob_mat[i])