import csv
import itertools
import operator
import statistics
from itertools import groupby
from operator import itemgetter

import numpy as np

# project-local modules
import helpers
import hierarchical
import sample
import totals_in_out

# NB: some names used below -- `area_samples`, `get_workplace_code`, `inter_level_transition_matrices`,
# `freq_mat_to_prob_mat`, `three_year_average_weights`, `get_raw_counts`, `estimated_population_growth`, and the
# configuration dicts `samp_yr_range`, `samp_areas`, `pop_yrs_for_fracts`, `pop_yr_range`, and `total_range` --
# are assumed to be defined or imported elsewhere in the project.


def make_percent_pre_1990_table(person_year_table, profession, out_dir, out_dir_area_samp=None, area_sample=False):
    """
    Make a table that shows, for every given year, the percentage of people in the system who entered the system
    prior to 1990, and dump it in a csv table. This is meant to show the rate of decrease of socialist-era judges.
    Percentages are disaggregated by judicial level.

    NB: when running this metric on the sample, this function assumes that entries and departures of pre-1990
    people into the sample balance out, so that the sampling itself doesn't influence the before-to-after 1990
    ratio.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param area_sample: bool, True if you want to exclusively use data from the Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions; False by default
    :param out_dir: str, directory where we want the non-area-sampled results table to live
    :param out_dir_area_samp: str, if given it's where we want the sample-area results table to live
    :return: None
    """
    if area_sample:
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]  # hard-coded since it changes very rarely
        person_year_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)
        out_dir = out_dir_area_samp

    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the table by person and year, then group it by persons
    person_year_table = sorted(person_year_table, key=itemgetter(pid_col_idx, yr_col_idx))
    people = [person for k, [*person] in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]

    # get the span of years
    years = sorted(list({int(py[yr_col_idx]) for py in person_year_table}))

    # initialise the nested dict holding the data, in three layers: hierarchical level, year, counts
    b4_1990_year_dict = {i: {yr: {"before 1990": 0, "total count": 0} for yr in years} for i in range(1, 5)}

    for person in people:
        first_career_year = int(person[0][yr_col_idx])
        for pers_yr in person:
            current_year = int(pers_yr[yr_col_idx])
            current_level = int(pers_yr[lvl_col_idx])
            b4_1990_year_dict[current_level][current_year]["total count"] += 1
            if first_career_year <= 1990:
                b4_1990_year_dict[current_level][current_year]["before 1990"] += 1

    # calculate the percentage from before 1990, only for 1990 and after (before 1990 it's always 100%)
    percs_lvl = {lvl: [] for lvl in b4_1990_year_dict}
    for lvl in b4_1990_year_dict:
        for yr in years:
            if yr >= 1990:
                percs_lvl[lvl].append(helpers.percent(b4_1990_year_dict[lvl][yr]["before 1990"],
                                                      b4_1990_year_dict[lvl][yr]["total count"]))

    # write each level's timeseries to disk
    with open(out_dir + "percent_pre_1990.csv", "w") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["Hierarchical Level"] + [yr for yr in years if yr >= 1990])
        for lvl in b4_1990_year_dict:
            writer.writerow([lvl] + percs_lvl[lvl])
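

# Illustrative sketch (hypothetical rows, not part of the pipeline): the sort-then-group idiom used above, which
# turns a flat person-year table into a list of careers. Sorting by person id first is essential, since
# itertools.groupby only merges *adjacent* rows with equal keys.
def _example_group_by_person():
    table = [["p2", 1996], ["p1", 1995], ["p1", 1996]]  # toy rows of [person id, year]
    table = sorted(table, key=itemgetter(0, 1))
    people = [person for k, [*person] in itertools.groupby(table, key=itemgetter(0))]
    return people  # [[['p1', 1995], ['p1', 1996]], [['p2', 1996]]]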
def estimated_population_size(person_year_table, profession):
    """
    To estimate the total size of the profession (i.e. of the population) for years in which we only have a sample,
    estimate the ratio between population size and sample size for years in which we DO have the whole population,
    then, for years with samples only, multiply the observed sample size by this population inflation ratio.

    To be exact:

    population inflation ratio = population size / sample size
    estimated population size = sample size * population inflation ratio

    NB: this assumes that the ratio between the total population and the sum of the sampled areas is roughly
    constant across the years whose population sizes we're estimating.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :return: dict of estimated population sizes: keys are years, values are estimates
    """
    samp_yrs, samp_as, ratio_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    pop_yrs, total_yrs = pop_yr_range[profession], total_range[profession]
    areas_sample_table = sample.appellate_area_sample(person_year_table, profession, samp_as)

    # get the ratio of sample size to population size for the desired years, averaged across said years
    samp_size = totals_in_out.pop_cohort_counts(areas_sample_table, total_yrs[0], total_yrs[1], profession,
                                                cohorts=False, unit_type="nivel")
    pop_size = totals_in_out.pop_cohort_counts(person_year_table, pop_yrs[0], pop_yrs[1], profession,
                                               cohorts=False, unit_type="nivel")
    avg_samp_size, avg_total_size = 0, 0
    for r_yr in ratio_yrs:
        avg_samp_size += samp_size["grand_total"][r_yr]["total_size"]
        avg_total_size += pop_size["grand_total"][r_yr]["total_size"]
    avg_samp_size = float(avg_samp_size) / float(len(ratio_yrs))
    avg_total_size = float(avg_total_size) / float(len(ratio_yrs))

    pop_inflation_ratio = avg_total_size / avg_samp_size

    # for each year in which we only have samples, multiply the number of people in the sample by the population
    # inflation ratio; these are the population estimates for the years in which we only have samples.
    # Round to 4 decimals.
    estim_pop = {}
    for yr in range(samp_yrs[0], samp_yrs[1] + 1):
        estim_pop.update({yr: round(float(samp_size["grand_total"][yr]["total_size"] * pop_inflation_ratio), 4)})

    return estim_pop
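

# Illustrative sketch with hypothetical numbers (not part of the pipeline): the inflation-ratio arithmetic above,
# reduced to plain dicts. If the sampled areas held 500 of 2000 practitioners in the ratio years, the inflation
# ratio is 4.0, so a sample-only year with 550 observed people implies roughly 2200 people overall.
def _example_population_inflation_ratio():
    sample_sizes = {2006: 500, 2007: 510}        # toy sample sizes for years with full population data
    population_sizes = {2006: 2000, 2007: 2040}  # toy population sizes for the same years
    ratio_yrs = [2006, 2007]
    avg_samp = sum(sample_sizes[y] for y in ratio_yrs) / len(ratio_yrs)     # 505.0
    avg_pop = sum(population_sizes[y] for y in ratio_yrs) / len(ratio_yrs)  # 2020.0
    pop_inflation_ratio = avg_pop / avg_samp  # 4.0
    return round(550 * pop_inflation_ratio, 4)  # 2200.0, the estimate for a sample-only year with 550 people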
def adjusted_retirement_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of retirements is that it does not distinguish between people who
    genuinely leave the profession and those who simply leave the sample (i.e. move to another area) but remain in
    the profession. Consequently, raw sample retirement counts are biased upwards because profession-exits and
    sample-exits are implicitly equated.

    The solution is to use the total population to compute the fraction of retirements from the sample area that
    are genuine departures from the profession, and then to multiply the raw sample retirement count by that
    fraction, thereby reducing the upward bias. To be exact, the genuine retirement fraction is computed as

    genuine retirement fraction = genuine retirement counts / (genuine retirement counts + sample-leaving counts)

    and the adjusted retirement count is therefore

    adjusted number of retirements = raw sample retirement count * genuine retirement fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we
                    weigh the observed counts in order to reduce bias
    :return: a nested dict, where 1st-layer keys are levels in the judicial hierarchy, 2nd-layer keys are years,
             and base values are the adjusted retirement counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year, then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    ret_fracts = {lvl: {"gen_rets": 0, "samp_leaves": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]
            # if this year is used for the fraction, and within the sampling areas
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx < len(person) - 1:  # we do look-aheads to see departures-cum-retirements
                    # if next year's area is NOT within the sampling areas, increment sample departures
                    if person[idx + 1][ca_cod_idx] not in samp_as:
                        ret_fracts[current_lvl]["samp_leaves"] += 1
                # if this is the last year of the career, increment genuine retirements
                # NB: this always assumes we pick a sampling year that is less than the right-censoring year
                else:
                    ret_fracts[current_lvl]["gen_rets"] += 1

    # average over the years, then get the final fraction per level
    for lvl in ret_fracts:
        avg_gen_rets = float(ret_fracts[lvl]["gen_rets"]) / float(len(fracts_yrs))
        avg_samp_leave_rets = float(ret_fracts[lvl]["samp_leaves"]) / float(len(fracts_yrs))
        ret_fracts[lvl] = helpers.weird_division(avg_gen_rets, (avg_gen_rets + avg_samp_leave_rets), mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ret_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=False)
    samp_ret_counts.pop("grand_total")  # don't need the grand total

    # and weigh them; round results to four decimals
    for lvl in samp_ret_counts:
        for yr in samp_ret_counts[lvl]:
            samp_ret_counts[lvl][yr] = round(samp_ret_counts[lvl][yr]["total_size"] * ret_fracts[int(lvl)], 4)

    if weights:
        return ret_fracts
    else:
        return samp_ret_counts
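

# Illustrative sketch with hypothetical counts (not part of the pipeline): if, across the fraction years, a level
# shows 30 genuine retirements and 10 sample-leavings, the genuine retirement fraction is 30 / (30 + 10) = 0.75,
# so an observed sample count of 20 departures is adjusted down to 15.
def _example_retirement_adjustment():
    gen_rets, samp_leaves = 30, 10                  # toy counts from the full-population years
    fraction = gen_rets / (gen_rets + samp_leaves)  # 0.75
    raw_sample_retirement_count = 20                # toy count from a sample-only year
    return round(raw_sample_retirement_count * fraction, 4)  # 15.0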
def adjusted_lateral_transfer_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of lateral transfers is that it is biased downward, for two reasons:

    a) those who transfer laterally to a position outside the sample will appear to have retired, thus biasing the
       lateral transfer count downward

    b) those who entered the sample via lateral transfer from outside the sample will appear to be new entrants,
       thus biasing the lateral transfer count downward

    Essentially, the sample only counts lateral transfers that occur within the sample, ignoring those lateral
    transfers that feature sample entry or departure. To fix this bias we use the total population to compute the
    genuine lateral transfer ratio, namely

    genuine lateral transfer ratio = (within-sample lateral transfers + lateral transfers leaving the sample
                                      + lateral transfers entering the sample) / within-sample lateral transfers

    and the adjusted lateral transfer count is therefore

    adjusted number of lateral transfers = within-sample lateral transfer count * genuine lateral transfer ratio

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we
                    weigh the observed counts in order to reduce bias
    :return: a nested dict, where 1st-layer keys are years, 2nd-layer keys are levels in the judicial hierarchy,
             and base values are the adjusted lateral transfer counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year, then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    trans_fracts = {lvl: {"within_samp_transfs": 0, "samp_leave_transfs": 0, "samp_ent_transfs": 0}
                    for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]
            # if this year is used for the fraction and this year is within the sample area
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx < len(person) - 1:  # we do look-aheads to judge mobility within or leaving the sample
                    # if the current hierarchical level equals NEXT year's AND the exact workplaces differ
                    # (i.e. there's a lateral transfer this year):
                    if current_lvl == int(person[idx + 1][lvl_col_idx]) and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx + 1], profession):
                        # if next year's area is outside the sample, increment the count of sample-leaving transfers
                        if person[idx + 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_leave_transfs"] += 1
                        # if next year's area is within the sample, increment the count of within-sample transfers
                        else:
                            trans_fracts[current_lvl]["within_samp_transfs"] += 1

                if 1 < idx:  # we do look-behinds to see if someone entered the sample from elsewhere
                    # if LAST year's hierarchical level was the same as this year's AND the exact workplaces differ
                    # (i.e. a lateral transfer occurred last year)
                    if int(person[idx - 1][lvl_col_idx]) == current_lvl and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx - 1], profession):
                        # if last year's area was not within the sample, increment the count of extra-sample
                        # entries via lateral transfer
                        if person[idx - 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_ent_transfs"] += 1

    # average over the years, then get the final ratio per level
    for lvl in trans_fracts:
        avg_within_samp_transfs = float(trans_fracts[lvl]["within_samp_transfs"]) / float(len(fracts_yrs))
        avg_samp_leave_transfs = float(trans_fracts[lvl]["samp_leave_transfs"]) / float(len(fracts_yrs))
        avg_samp_ent_transfs = float(trans_fracts[lvl]["samp_ent_transfs"]) / float(len(fracts_yrs))
        trans_fracts[lvl] = helpers.weird_division((avg_within_samp_transfs + avg_samp_leave_transfs
                                                    + avg_samp_ent_transfs), avg_within_samp_transfs, mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_transf_counts = hierarchical.hierarchical_mobility(cas_sample_table, profession)

    # and weigh them; round results to four decimals
    for yr in samp_transf_counts:
        for lvl in samp_transf_counts[yr]:
            samp_transf_counts[yr][lvl] = round(samp_transf_counts[yr][lvl]["across"]["total"] * trans_fracts[lvl], 4)

    if weights:
        return trans_fracts
    else:
        return samp_transf_counts
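

# Illustrative sketch with hypothetical counts (not part of the pipeline): with 40 within-sample transfers,
# 5 transfers leaving the sample, and 5 entering it, the genuine lateral transfer ratio is (40 + 5 + 5) / 40 = 1.25,
# so an observed within-sample count of 12 is inflated to 15.
def _example_lateral_transfer_adjustment():
    within, leaving, entering = 40, 5, 5            # toy counts from the full-population years
    ratio = (within + leaving + entering) / within  # 1.25
    within_sample_count = 12                        # toy count from a sample-only year
    return round(within_sample_count * ratio, 4)    # 15.0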
def adjusted_entry_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of entries is that it does not distinguish between people who are
    genuinely new recruits to the profession, and those who were already in the profession but outside the sample.
    Consequently, the raw count is biased upwards because it equates entering the sample from within the profession
    with entering the profession tout court.

    The solution is to use the total population to compute the fraction of entries into the sample that are genuine
    recruits into the profession, and then to multiply the raw sample entry count by that fraction, thereby reducing
    the upward bias. To be exact, the genuine entry fraction is computed as

    genuine entry fraction = genuine entry counts / (genuine entry counts + sample-entering counts)

    and the adjusted entry count is therefore

    adjusted number of entries = sample entry count * genuine entry fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we
                    weigh the observed counts in order to reduce bias
    :return: a nested dict, where 1st-layer keys are levels in the judicial hierarchy, 2nd-layer keys are years,
             and base values are the adjusted entry counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year, then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    ent_fracts = {lvl: {"gen_ents": 0, "samp_ents": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]
            # if this year is used for the fraction and this year is within the sample area
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                # if it's genuinely the first year of the career, increment genuine entries
                # NB: this always assumes that we skip the left-censor year
                if idx == 0:
                    ent_fracts[current_lvl]["gen_ents"] += 1

                if 1 < idx:  # we do look-behinds to see if someone entered the sample from elsewhere
                    # if LAST year's appellate area is different from this year's appellate area, increment the
                    # count of extra-sample entries
                    if current_area != person[idx - 1][ca_cod_idx]:
                        ent_fracts[current_lvl]["samp_ents"] += 1

    # average over the years, then get the final fraction per level
    for lvl in ent_fracts:
        avg_gen_ents = float(ent_fracts[lvl]["gen_ents"]) / float(len(fracts_yrs))
        avg_samp_ents = float(ent_fracts[lvl]["samp_ents"]) / float(len(fracts_yrs))
        ent_fracts[lvl] = helpers.weird_division(avg_gen_ents, (avg_gen_ents + avg_samp_ents), mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ent_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=True)
    samp_ent_counts.pop("grand_total")  # don't need the grand total

    # and weigh them; round results to four decimals
    for lvl in samp_ent_counts:
        for yr in samp_ent_counts[lvl]:
            samp_ent_counts[lvl][yr] = round(samp_ent_counts[lvl][yr]["total_size"] * ent_fracts[int(lvl)], 4)

    if weights:
        return ent_fracts
    else:
        return samp_ent_counts
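

# Illustrative usage sketch (hypothetical call, not part of the pipeline): the three adjusted_* functions above
# share one calling convention, so a caller can collect bias-corrected mobility counts, or the underlying per-level
# correction weights, in one pass. Assumes the module-level configuration dicts (samp_yr_range, samp_areas,
# pop_yrs_for_fracts) are populated for the given profession.
def _example_adjusted_counts_usage(person_year_table):
    ret_counts = adjusted_retirement_counts(person_year_table, "judges")
    transf_counts = adjusted_lateral_transfer_counts(person_year_table, "judges")
    ent_counts = adjusted_entry_counts(person_year_table, "judges")
    # with weights=True we get the per-level correction factors instead of the weighted counts
    ret_weights = adjusted_retirement_counts(person_year_table, "judges", weights=True)
    return ret_counts, transf_counts, ent_counts, ret_weights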
def avg_career_length(person_year_table, profession, area_sample=True):
    """
    Print out the yearly average career length per judicial level, so we can answer questions like "did tribunal
    judges (i.e. level 2) become more experienced, on average, between 1995 and 2005?"

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param area_sample: bool, True if you want to exclusively use data from the Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions
    :return: None
    """
    # TODO if using the sample (and not the whole population), we need to top up estimates to account for the fact
    #  that some people enter from outside the sample, so it might look like it's their first year when really
    #  they've had a longer career already

    if area_sample:
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]
        person_year_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)

    # get handy column indexes, then sort and group the table by persons
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')
    person_year_table = sorted(person_year_table, key=operator.itemgetter(pid_col_idx, yr_col_idx))
    people = [person for k, [*person] in groupby(person_year_table, key=operator.itemgetter(pid_col_idx))]

    # make an augmented table where the last column is the career length of that person, in that year
    # NB: by convention we 1-index, i.e. your career length is "1" in the first year for which we observe you
    augmented_table = []
    for person in people:
        for idx, pers_yr in enumerate(person):
            augmented_table.append(pers_yr + [idx + 1])

    # for each year, get the average career length per level
    years = sorted(list({int(py[yr_col_idx]) for py in augmented_table}))
    year_dict = {year: {"1": [], "2": [], "3": [], "4": []} for year in years}

    # sort and group the augmented table by year
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')
    augmented_table.sort(key=operator.itemgetter(yr_col_idx))
    year_groups = [yr for k, [*yr] in groupby(augmented_table, key=operator.itemgetter(yr_col_idx))]

    for yr_group in year_groups:
        # recall that a year-group is made of person-years, all sharing the same year, e.g. 1996
        current_year = int(yr_group[0][yr_col_idx])
        # build the per-level person-year lists for each year, in the year_dict
        for pers_yr in yr_group:
            py_lvl = pers_yr[lvl_col_idx]
            year_dict[current_year][py_lvl].append(int(pers_yr[-1]))

    # get the level average for each year
    for yr in year_dict:
        for lvl in year_dict[yr]:
            if year_dict[yr][lvl]:  # need to be careful, there's no level 3 before 1993
                year_dict[yr][lvl] = round(statistics.mean(year_dict[yr][lvl]), 2)

    # print the results
    for yr in year_dict:
        print(yr, ' | ', year_dict[yr])
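

# Illustrative sketch (hypothetical rows, not part of the pipeline): the augmentation step above, on a toy
# two-person table where each row is [person id, year]. Career length is 1-indexed, so a person's first observed
# year carries length 1.
def _example_career_length_augmentation():
    table = [["p1", 1995], ["p1", 1996], ["p2", 1996]]  # toy rows of [person id, year]
    table.sort(key=operator.itemgetter(0, 1))
    augmented = []
    for _, person in groupby(table, key=operator.itemgetter(0)):
        for idx, row in enumerate(person):
            augmented.append(row + [idx + 1])
    return augmented  # [['p1', 1995, 1], ['p1', 1996, 2], ['p2', 1996, 1]]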
def retirement_promotion_estimates(person_year_table, profession, sampling_year_range, out_dir):
    """
    Estimate how many people retire and move up the legal hierarchy (i.e. get promoted) every year, both in raw
    counts and relative to the population of people open to such retirement.

    Post-2005 we have the complete population of magistrates (i.e. judges and prosecutors), but pre-2005 we have
    only non-random samples. For judges I sample four appellate areas (Alba, Craiova, Iaşi, and Ploieşti) because I
    have yearly data on all courts in these areas since at least 1980. That said, mobility estimates from these
    samples need to be corrected. In particular, I look at three sorts of mobility: retirement, promotion, and
    entry.

    Post-2005 we are certain that someone retires when they are in the population in year X but absent in year X+1.
    Pre-2005 we can't be certain, because that person may have left the sample but stayed in the population, i.e.
    they may have simply changed appellate area. I therefore correct sample estimates as follows:

    - for the intervals 2006-2007, 2007-2008, and 2008-2009, see how many magistrates in the sampled areas (Alba,
      Craiova, Iaşi, and Ploieşti) actually retired, and how many just left their respective area. Compute the
      ratio "retirement counts" / "retirement counts + area-leaving counts" for each interval, and take the
      three-interval average. The result is a weight: X% of the people that departed the sampled areas actually
      retired. There is one ratio for each judicial level (i.e. low court, tribunal, and appeals).

    - for pre-2005 I count how many people left the sample, then multiply the per-level count by the appropriate
      weight. Obviously, this assumes that the ratio between retirements and area changes is constant over this
      period. I cannot numerically check that assumption.

    Regarding promotion, post-2005 we can just see if someone's judicial level increased between years. Pre-2005
    this count will be biased in the sample because a) those who receive a promotion outside the sample show up as
    retirements, and b) those who entered the sample upon promotion look like new entrants. To address this I
    construct two weights: the ratio of within-area promotions to total promotions, and the ratio of
    entrants-by-promotion to total entrants (per year, per level). The final count of (weighted) sample promotions
    is then computed as follows:

    raw count * (1 / within-total-ratio) + count of entrants * promotion-entrants-to-total-ratio

    Finally, to estimate the number of entrants into the profession using the sample, I compute:

    count of entrants * (1 - promotion-entrants-to-total-ratio)

    Again, the assumption is that the relative balance of inter-area mobility flows is constant throughout the
    period under study, and therefore that ratios derived from 2006-2009 hold at other times as well. I choose the
    2006-2009 period because a) it features the earliest population-level data, and b) it did not feature major
    judicial reforms.

    Finally, we also want estimates of the total size of the population and of year-on-year population growth.

    :param person_year_table: a table of person-years, as a list of lists; NB: assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, the range of years for which we're estimating mobility,
                                e.g. (1998, 2004)
    :param out_dir: directory where the tables of mobility estimates will live
    :return: None
    """
    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')

    # sort the person-year table by person then year
    person_year_table.sort(key=operator.itemgetter(pid_col_idx, yr_col_idx))

    # sample all courts in these appeals regions: Alba (CA1), Craiova (CA7), Iaşi (CA9), Ploieşti (CA12)
    appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]
    cas_sample_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)

    # get weights for retirement, promotion, and entry
    # for those appeals areas, for the 2006-2007 and 2007-2008 intervals, per hierarchical level:
    # a) get the ratio of within-area promotions (i.e. people who were already in the area) to total promotions
    # b) get the ratio of retirements to retirements + out-of-area transfers
    # Average the values across the intervals: these will be the weights for estimates from earlier years
    weights = three_year_average_weights(person_year_table, profession, appellate_areas_to_sample,
                                         ["2006", "2007", "2008"])
    retirement_weights = weights["ret_weight"]
    internal_promotion_weights = weights["int_prom_weight"]
    external_promotion_weights = weights["ext_prom_weight"]

    # get raw counts of entries, retirements, and promotions per year, per level, in the desired time-frame
    counts = get_raw_counts(cas_sample_table, profession, sampling_year_range)
    ent_counts, ret_counts, prom_counts = counts["entries"], counts["retirements"], counts["promotions"]

    # now weigh those counts with the average ratios from 2006-2008. Recall (counts are from the sample):
    # estimated retirements = retirement count * retirement weight
    # estimated promotions = promotion count * (1 / internal promotion weight)
    #                        + entry count * external promotion weight
    # estimated entries = entry count * (1 - external promotion weight)
    for key in internal_promotion_weights:
        for year in ret_counts.keys():
            # round to whole numbers, since these are whole people
            ret_counts[year][key] = round(float(ret_counts[year][key]) * retirement_weights[key])
            prom_counts[year][key] = round(float(helpers.weird_division(prom_counts[year][key],
                                                                        internal_promotion_weights[key])
                                                 + float(ent_counts[year][key]) * external_promotion_weights[key]))
            ent_counts[year][key] = round(ent_counts[year][key] * (1 - external_promotion_weights[key]))

    # relabel, strictly for clarity (note that these are not deep copies)
    weighted_ret_counts = ret_counts
    weighted_prom_counts = prom_counts
    weighted_ent_counts = ent_counts

    # using the (weighted-estimated) sample counts, estimate yearly, per-level retirement and promotion
    # probabilities, where the denominator is the sample count of person-years in year X; also estimate what
    # proportion of each year's sample are new entrants
    yearly_counts = counts["total counts"]
    retire_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts.keys()}
    promotion_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts.keys()}
    entry_proportions = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts.keys()}

    for year in yearly_counts:
        for lvl in yearly_counts[year]:
            promotion_probs[year][lvl] = helpers.weird_division(weighted_prom_counts[year][lvl],
                                                                yearly_counts[year][lvl])
            retire_probs[year][lvl] = helpers.weird_division(weighted_ret_counts[year][lvl],
                                                             yearly_counts[year][lvl])
            # NB: entry proportions are simple: how many of this year's sample are newcomers?
            entry_proportions[year][lvl] = helpers.weird_division(weighted_ent_counts[year][lvl],
                                                                  yearly_counts[year][lvl])

    # estimate the size of the professional population for years in which we only have samples
    estimated_pop = estimated_population_size(person_year_table, cas_sample_table, profession, sampling_year_range)

    # estimate year-on-year population growth
    estimated_pop_growth = estimated_population_growth(estimated_pop, sampling_year_range)

    # save to disk one table each for retirements, promotions, and entries, and one table for estimated population
    # size and growth
    with open(out_dir + "retirements.csv", 'w') as out_ret:
        writer = csv.writer(out_ret)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "LEVEL", "PROJECTED COUNT RETIREMENTS", "SAMPLE RETIREMENT PROBABILITY"])
        for year in weighted_ret_counts:
            for lvl in weighted_ret_counts[year]:
                writer.writerow([year, lvl, weighted_ret_counts[year][lvl], retire_probs[year][lvl]])

    with open(out_dir + "promotions.csv", 'w') as out_prom:
        writer = csv.writer(out_prom)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "LEVEL", "PROJECTED COUNT PROMOTIONS", "SAMPLE PROMOTION PROBABILITY"])
        for year in weighted_prom_counts:
            for lvl in weighted_prom_counts[year]:
                if lvl in promotion_probs[year]:
                    writer.writerow([year, lvl, weighted_prom_counts[year][lvl], promotion_probs[year][lvl]])

    with open(out_dir + "entries.csv", 'w') as out_ent:
        writer = csv.writer(out_ent)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "LEVEL", "PROJECTED COUNT ENTRIES", "SAMPLE ENTRY PROPORTIONS"])
        for year in weighted_ent_counts:
            for lvl in weighted_ent_counts[year]:
                writer.writerow([year, lvl, weighted_ent_counts[year][lvl], entry_proportions[year][lvl]])

    with open(out_dir + "growth.csv", 'w') as out_grow:
        writer = csv.writer(out_grow)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "PROJECTED POPULATION", "SAMPLE PERCENT GROWTH SINCE PREVIOUS YEAR"])
        for year in estimated_pop:
            if year == min(estimated_pop.keys()):  # we only know population growth from the second year on
                writer.writerow([year, estimated_pop[year], "NA"])
            else:
                writer.writerow([year, estimated_pop[year], estimated_pop_growth[year]])
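

# Illustrative sketch with hypothetical weights (not part of the pipeline): the promotion-weighting arithmetic
# above. With 8 observed within-sample promotions, an internal-promotion weight of 0.8, 10 observed entrants, and
# an external-promotion weight of 0.2, the promotion estimate is 8 / 0.8 + 10 * 0.2 = 12.
def _example_weighted_promotion_count():
    prom_count, int_prom_weight = 8, 0.8  # toy within-sample promotion count and internal promotion weight
    ent_count, ext_prom_weight = 10, 0.2  # toy entrant count and external promotion weight
    estimated_promotions = round(prom_count / int_prom_weight + ent_count * ext_prom_weight)  # 12
    estimated_entries = round(ent_count * (1 - ext_prom_weight))                              # 8
    return estimated_promotions, estimated_entries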
def make_vacancy_transition_tables(person_year_table, profession, out_dir, years, averaging_years=None,
                                   area_samp=False, out_dir_area_samp=None):
    """
    Make a csv containing one sub-table for each of the years we select, with each sub-table showing the transition
    probabilities between hierarchical levels of vacancies. Optionally, we may also include a table that averages
    across the desired years, e.g. 1984-1989.

    Each sub-table should be NxN+1, where N = number of levels, and the last column represents vacancies leaving
    the system, i.e. people being recruited into the system.

    NB: diagonals signify mobility WITHIN a level.

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param out_dir: str, the path to where the transition matrices will live
    :param years: list of ints, the years for which we want vacancy probability transition matrices
    :param averaging_years: list of ints over which we want to average the vacancy frequency tables,
                            e.g. [1985, 1986, 1987]
    :param area_samp: bool, True if we want to sample from specific areas
    :param out_dir_area_samp: if given, str showing the out-directory where we want the vacancy transition tables
                              for the sample areas to live
    :return: None
    """
    averaging_years = averaging_years if averaging_years else []  # if no averaging years given, use an empty list
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)
    proms_weights, demos_weights, transfs_weights = None, None, None  # left as None so misuse fails loudly

    # get entry counts, in an easy format
    entry_counts = totals_in_out.pop_cohort_counts(sorted_person_year_table, years[0], years[-1], profession,
                                                   cohorts=True, unit_type="nivel", entry=True)
    entry_counts.pop("grand_total")  # don't need the grand total
    for lvl in entry_counts:
        for yr in entry_counts[lvl]:
            entry_counts[lvl][yr] = entry_counts[lvl][yr]["total_size"]

    if area_samp:
        # I hard-code these in since they change so rarely
        samp_areas = {"judges": ["CA1", "CA7", "CA9", "CA12", "-88"], "prosecutors": []}
        samp_yr_range = {"judges": [1980, 2003], "prosecutors": []}
        samp_yrs, samp_as = samp_yr_range[profession], samp_areas[profession]

        # get sample-adjusted entry counts and sample weights for mobility
        entry_counts = area_samples.adjusted_entry_counts(person_year_table, profession)
        proms_weights = area_samples.adjusted_promotion_counts(sorted_person_year_table, profession, weights=True)
        demos_weights = area_samples.adjusted_demotion_counts(sorted_person_year_table, profession, weights=True)
        transfs_weights = area_samples.adjusted_lateral_transfer_counts(sorted_person_year_table, profession,
                                                                        weights=True)

        # restrict the person-year table to the sampling areas and redirect the out-directory
        sorted_person_year_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
        out_dir = out_dir_area_samp

    # get the person-level transition frequencies between levels
    trans_freqs = inter_level_transition_matrices(sorted_person_year_table, profession)

    with open(out_dir + "vacancy_probability_transition_matrixes.csv", "w") as out_f:
        writer = csv.writer(out_f)

        # the accumulator for averaging; this stays unused if averaging_years is empty
        avg_vac_trans_mat = np.zeros((4, 5))

        # for each sampling year
        for yr in years:
            # make an array of zeros for four levels; not all years have four levels, but zero rows/columns
            # are harmless
            trans_mat = np.zeros((4, 4))

            for lvl in range(1, 5):  # for departure levels, i.e. the level FROM which mobility happens
                if lvl in trans_freqs[yr]:  # if the level exists in that year (since some are added later)
                    # now weigh the observed values
                    # NB: route = mobility route, e.g. "1-2" means "mobility from level 1 to level 2"
                    for route, mob_freq in trans_freqs[yr][lvl].items():
                        # ignore retirements, non-movements, sums, and discontinuities
                        if route.split("-")[1].isdigit():
                            # the level you leave and the level you go to; -1 since numpy zero-indexes
                            departing, arriving = int(route.split("-")[0]) - 1, int(route.split("-")[1]) - 1
                            # put the frequency counts in the frequency matrix; if sampling, weigh the counts
                            if departing < arriving:  # promotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * proms_weights[lvl], 5)
                            if departing == arriving:  # lateral transfers
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * transfs_weights[lvl], 5)
                            if departing > arriving:  # demotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * demos_weights[lvl], 5)

            # transpose the person-level mobility frequency matrix to get the vacancy mobility matrix
            vac_trans_mat = np.transpose(trans_mat)

            # by convention, we thus far treated levels in incrementing order, i.e. level 1 < 2 < 3 < 4. The
            # convention in vacancy chains studies is that 1 > 2 > 3 > 4, and to get that we transpose the array
            # along the anti-diagonal/off-diagonal
            vac_trans_mat = vac_trans_mat[::-1, ::-1].T

            # in the last column we put vacancy "retirements", i.e. entries of people into the system
            entry_freqs = [entry_counts[str(level)][yr] for level in range(1, 5) if str(level) in entry_counts]
            entries_col = np.asarray(entry_freqs[::-1])[..., None]  # Nx1 shape; reverse order for 1 > 2 > 3
            vac_trans_mat = np.append(vac_trans_mat, entries_col, 1)

            if yr in averaging_years:
                avg_vac_trans_mat = np.add(avg_vac_trans_mat, vac_trans_mat)

            vac_prob_mat = freq_mat_to_prob_mat(vac_trans_mat.tolist(), round_to=5)

            # add that transition probability matrix to the table
            writer.writerow([profession.upper(), yr])
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(vac_prob_mat)):
                writer.writerow([header[1:][i]] + vac_prob_mat[i])
            writer.writerow(["\n"])

        if averaging_years:
            # divide the summed frequency matrices by the number of years we averaged over
            avg_vac_trans_mat = np.divide(avg_vac_trans_mat, float(len(averaging_years)))
            avg_vac_prob_mat = freq_mat_to_prob_mat(avg_vac_trans_mat.tolist(), round_to=5)
            writer.writerow(["AVERAGED ACROSS YEARS"] + averaging_years)
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(avg_vac_prob_mat)):
                writer.writerow([header[1:][i]] + avg_vac_prob_mat[i])
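

# Illustrative sketch with hypothetical numbers (not part of the pipeline): the two matrix moves above, shown on a
# 2-level toy case. Transposing the person-flow matrix turns "people moving 1 -> 2" into "vacancies moving 2 -> 1",
# and the anti-diagonal flip reorders levels from 1 < 2 to 1 > 2.
def _example_vacancy_matrix_moves():
    person_flows = np.array([[5., 3.],   # row = departing level, column = arriving level
                             [1., 7.]])
    vac = np.transpose(person_flows)     # vacancies flow opposite to people
    vac = vac[::-1, ::-1].T              # flip along the anti-diagonal: level order becomes 1 > 2
    entries = np.asarray([2., 4.])[..., None]   # toy recruits column, one value per level
    return np.append(vac, entries, 1)    # shape (2, 3): levels x (levels + recruits)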