def make_percent_pre_1990_table(person_year_table, profession, out_dir, out_dir_area_samp=None, area_sample=False):
    """
    Write a csv table showing, for every year from 1990 onward, the percentage of people in the system whose
    first observed career year is 1990 or earlier. This is meant to show the rate of decrease of socialist-era
    judges. Percentages are disaggregated by hierarchical level (levels 1 through 4), one table row per level.

    NB: when running this metric on the sample, this function assumes that entries and departures of pre-1990
    people into the sample balance out, so that the sampling itself doesn't influence the before-to-after 1990
    ratio.

    NB: the output path is built by plain string concatenation, so the output directory must end with a path
    separator. When area_sample is True the table is written to out_dir_area_samp, which must then be provided
    (leaving it as None raises a TypeError when the path is built).

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: str, directory where we want the non-area-sampled results table to live
    :param out_dir_area_samp: str, if given it's where we want the sample-area results table to live
    :param area_sample: bool, True if you want to exclusively use data from Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions; False by default
    :return: None
    """
    if area_sample:
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]  # hard-coded since it changes very rarely
        person_year_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)
        out_dir = out_dir_area_samp

    # look the header up once, then get the handy column indexes
    header = helpers.get_header(profession, 'preprocess')
    yr_col_idx = header.index('an')
    pid_col_idx = header.index('cod persoană')
    lvl_col_idx = header.index('nivel')

    # sort a copy of the table by person and year, then group it by person
    person_year_table = sorted(person_year_table, key=itemgetter(pid_col_idx, yr_col_idx))
    people = [list(person_years)
              for _, person_years in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]

    # the span of observed years, in ascending order
    years = sorted({int(py[yr_col_idx]) for py in person_year_table})

    # nested dict holding the counts, in three layers: hierarchical level, year, counts
    b4_1990_year_dict = {lvl: {yr: {"before 1990": 0, "total count": 0} for yr in years} for lvl in range(1, 5)}

    for person in people:
        # rows are sorted by year within a person, so the first row is the first observed career year
        entered_by_1990 = int(person[0][yr_col_idx]) <= 1990
        for pers_yr in person:
            counts = b4_1990_year_dict[int(pers_yr[lvl_col_idx])][int(pers_yr[yr_col_idx])]
            counts["total count"] += 1
            if entered_by_1990:
                counts["before 1990"] += 1

    # percentages are only reported for 1990 and after (before 1990 it's always 100%)
    reported_years = [yr for yr in years if yr >= 1990]
    percs_lvl = {
        lvl: [helpers.percent(yearly[yr]["before 1990"], yearly[yr]["total count"]) for yr in reported_years]
        for lvl, yearly in b4_1990_year_dict.items()
    }

    # write each level's time series to disk; newline="" is what the csv module requires of files handed to
    # csv.writer, otherwise platforms that translate newlines emit blank rows
    with open(out_dir + "percent_pre_1990.csv", "w", newline="") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["Hierarchical Level"] + reported_years)
        for lvl, percentages in percs_lvl.items():
            writer.writerow([lvl] + percentages)
def percent_female(count_dict, units, unit_type=None):
    """
    Refresh the 'percent_female' entries of count_dict, in place.

    For every year under the 'grand_total' key, and (when unit_type is truthy) under every unit key as well,
    the percentage is recomputed from the 'f' and 'total_size' counts. Entries whose 'total_size' is zero are
    left untouched.

    :param count_dict: a dictionary of counts -- for format, see function metrics_dict
    :param units: a set of unit categories, each a string
    :param unit_type: None, or string; if string, type of the unit as it appears in header of person_year_table
                      (e.g. "camera")
    :return: None
    """
    for year, totals in count_dict['grand_total'].items():
        # percent female for the whole cohort in this year
        if totals['total_size'] != 0:
            totals['percent_female'] = helpers.percent(totals['f'], totals['total_size'])

        # per-unit percentages are only wanted when a unit type was requested
        if not unit_type:
            continue

        for u in units:
            unit_counts = count_dict[u][year]
            if unit_counts['total_size'] != 0:
                unit_counts['percent_female'] = helpers.percent(unit_counts['f'], unit_counts['total_size'])
def estimated_population_growth(estimated_pop, sampling_year_range):
    """
    Compute year-on-year population growth, in percent, relative to the preceding year's estimated population.

    The first year of the range gets no entry, since it has no preceding year inside the range.

    :param estimated_pop: dict: keys are years (as strings), values are estimated population sizes
    :param sampling_year_range: 2-tuple of ints, range of years for which we're estimating mobility,
                                e.g. (1998, 2004); both ends are inclusive
    :return: dict of estimated growth percentages: keys are years (as strings), values are percentages
    """
    first_year = sampling_year_range[0]
    last_year = sampling_year_range[1]

    growth_by_year = {}
    for current in range(first_year + 1, last_year + 1):
        current_key, previous_key = str(current), str(current - 1)
        # absolute change from the preceding year, then expressed relative to that preceding year
        change = estimated_pop[current_key] - estimated_pop[previous_key]
        growth_by_year[current_key] = helpers.percent(change, estimated_pop[previous_key])
    return growth_by_year
def update_cohort_of_population(cohorts_dict, population_dict, entry=True, units=None):
    """
    Updates, in place, the value that shows how big a yearly cohort is relative to all the people in that year
    (key 'chrt_prcnt_of_pop'). Cohorts whose 'total_size' is zero are left untouched.

    NB: for entry cohorts, we compare cohort sizes to all people in the PREVIOUS year, falling back on the
    current year when the population dict has no data for the previous year (e.g. the first observed year).
    For exit cohorts, we compare cohort sizes to all people in the CURRENT year.

    NB: years are expected to be ints, since the previous year is found as year - 1.

    :param cohorts_dict: a dictionary of cohorts; top-level keys are 'grand_total' and the unit names, each
                         holding a dict where each key is a year and values are metrics for that cohort
    :param population_dict: a dictionary for the whole population, with the same layout: for each of
                            'grand_total' and the unit names, each key is a year, and values are metrics for
                            all population members for that year
    :param entry: bool, True if we're getting data for entry cohorts, False if for exit cohorts
    :param units: a set of unique units of a certain type, e.g. towns
    :return: None
    """

    def reference_population(group, year):
        """Return the population size that the cohort of `group` in `year` is compared against."""
        yearly_population = population_dict[group]
        # The membership test must look at the per-group dict of years. The top level of these dicts is keyed
        # by 'grand_total' and unit names, so testing against it would never find a year.
        if entry and year - 1 in yearly_population:
            return yearly_population[year - 1]['total_size']
        return yearly_population[year]['total_size']

    for year in cohorts_dict['grand_total']:
        # the grand total always gets updated, the units only if they were requested
        groups = ['grand_total']
        if units:
            groups.extend(units)

        for group in groups:
            cohort = cohorts_dict[group][year]
            if cohort['total_size'] != 0:
                cohort['chrt_prcnt_of_pop'] = helpers.percent(cohort['total_size'],
                                                              reference_population(group, year))
def inter_professional_transfers(multiprofs_py_table, out_dir, year_window):
    """
    Finds possible name matches between people who retired in year X from profession A, and people who joined
    professions B, C... in a window of years starting at X. In other words, if someone left a profession one
    year, see if within the window they joined any of the other professions. The set of candidate entrants
    (and therefore the exact extent of the window) comes from the function "other_professions_entrants".

    Besides returning the counts, this function writes a csv log of all matches, sorted by exit year, to
    out_dir, so the matches can be inspected visually.

    NB: need to choose carefully the start and end years since only for some years do we have overlap between
    different professions

    NB: this function assumes that each match will be human-checked afterwards. Consequently, it errs on the
    side of over-inclusion, i.e. prefers false positives.

    NB: the first and last observation years are read off the first and last rows of the table, so the table
    is presumably expected to arrive sorted by year -- confirm against the caller.

    NB: the log path is built by plain string concatenation, so out_dir must end with a path separator.

    :param multiprofs_py_table: person-year table of all professions
    :param out_dir: directory where the log of interprofessional transition matches will live
    :param year_window: int, how many years after exit we look for interprofessional transition;
                        if year_window = 0, we want only professional transfers in the exit year
                        if year_window = 3, we want only professional transfers in the exit year and two
                        consecutive years, e.g. 2000-2002 (the years 2000, 2001, and 2002)
                        etc.
    :return: dict of transfer measures: level 1 key is exit year, level 2 key is sending profession, level 3
             key is receiving profession; the innermost dict holds 'total transfers', 'women transfers' and
             'percent women transfers'
    """
    # load the gender dict, we'll need this later
    gender_dict = gender.get_gender_dict()

    # get start and end year of all observations
    year_col_idx = helpers.get_header('all', 'combine').index('an')
    start_year = int(multiprofs_py_table[0][year_col_idx])
    end_year = int(multiprofs_py_table[-1][year_col_idx])

    # initialise a list/log of matches/putative cross-professional transfers, so we can eyeball for errors
    transfer_match_log = []

    # for each profession get the first and last observation years and the full names of yearly entry and exit
    # cohorts
    professions_data = professions_yearspans_cohorts(multiprofs_py_table, combined=True)

    # Per year, a sender-by-receiver matrix of measures: the first-level key is the row/sender, the
    # second-level key is the column/receiver. Every cell gets its own copy of the measures.
    # NB: the end year itself gets no matrix.
    measures = {'total transfers': 0, 'women transfers': 0, 'percent women transfers': 0}
    transfers_dict = {
        year: {sender: {receiver: deepcopy(measures) for receiver in professions_data}
               for sender in professions_data}
        for year in range(start_year, end_year)
    }

    for sending_profession in professions_data:

        # for each yearly exit cohort
        for exit_year, names in professions_data[sending_profession]['exit'].items():

            # get set of entrants to OTHER professions, within the time window that starts in the exit year
            other_profs_entrants = other_professions_entrants(sending_profession, professions_data,
                                                              exit_year, year_window)

            # NOTE(review): raises KeyError if a profession reports an exit cohort for a year outside
            # range(start_year, end_year) -- confirm that professions_yearspans_cohorts never does
            sender_row = transfers_dict[exit_year][sending_profession]

            for exitee_name in names:
                # look for name match in set of entrants into other professions, in the specified time window
                for entrant in other_profs_entrants:
                    entrant_name, entry_year, entry_profession = entrant[0], entrant[1], entrant[2]

                    if not name_match(exitee_name, entrant_name):
                        continue

                    # add match to log for visual inspection
                    transfer_match_log.append([exitee_name, exit_year, sending_profession, '',
                                               entrant_name, entry_year, entry_profession])

                    cell = sender_row[entry_profession]
                    cell['total transfers'] += 1

                    # full names look like "surnames | given names"; gender is inferred from the given names
                    exitee_given_names = exitee_name.split(' | ')[1]
                    if gender.get_gender(exitee_given_names, exitee_name, gender_dict) == 'f':
                        cell['women transfers'] += 1

            # for that year get percent female transfers
            for cell in sender_row.values():
                cell['percent women transfers'] = helpers.percent(cell['women transfers'],
                                                                  cell['total transfers'])

    # write the match list log to disk for visual inspection; newline='' is what the csv module requires of
    # files handed to csv.writer, otherwise platforms that translate newlines emit blank rows
    log_out_path = out_dir + 'interprofessional_transitions_' + str(year_window) \
        + '_year_window_match_list_log.csv'
    with open(log_out_path, 'w', newline='') as out_p:
        writer = csv.writer(out_p)
        writer.writerow(["EXITEE NAME", "EXIT YEAR", "EXIT PROFESSION", "",
                         "ENTRANT NAME", "ENTRY YEAR", "ENTRANT PROFESSION"])
        writer.writerows(sorted(transfer_match_log, key=itemgetter(1)))  # sorted by exit year

    return transfers_dict
def prof_inherit_table(out_dir, person_year_table, profession, year_window=1000, num_top_names=0,
                       multi_name_robustness=False):
    """
    Puts the profession inheritance dict in a csv table, adding some percentages and sums.

    The first row of the table is the upper-cased profession name. The second row is the header
    "YEAR", "MALE ENTRIES", "FEMALE ENTRIES", "TOTAL ENTRIES", "MALE INHERITANCE COUNT",
    "FEMALE INHERITANCE COUNT", "TOTAL INHERITANCE COUNT", "MALE INHERITANCE PERCENT",
    "FEMALE INHERITANCE PERCENT", "TOTAL INHERITANCE PERCENT".
    Then comes one row per year of the inheritance dict, and finally a "GLOBAL" row with the sums across all
    years and the percentages computed from those sums.

    :param out_dir: directory where the inheritance table will live
    :param person_year_table: a table of person years as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param year_window: int, how far back you want to look for inheritance; e.g. year_window == 4, we look four
                        years back, so if in 2004, we look back to 2000 (inclusive); Default is 1000, i.e. look
                        at all years
    :param num_top_names: int, the number of top most frequent surnames that we consider the set of "common"
                          surnames, e.g. if num_top_names == 10, the ten surnames with the most associated
                          people are considered the "most common" surnames; Default is zero, i.e. no names are
                          common
    :param multi_name_robustness: bool, True if we're running the multi-name robustness check
    :return: None
    """
    # get the inheritance dict
    inheritance_dict = profession_inheritance(out_dir, person_year_table, profession, year_window,
                                              num_top_names, multi_name_robustness=multi_name_robustness)

    # the robustness-check table gets its own marker in the file name, so the two runs don't overwrite each other
    robustness_tag = '_MN_ROBUST' if multi_name_robustness else ''
    table_out_path = out_dir + '/' + profession + robustness_tag + '_exclude_surnames_above_rank_' \
        + str(num_top_names) + '_inheritance_table.csv'

    # running sums across all years, for the final "GLOBAL" row
    sum_male_entries, sum_female_entries = 0, 0
    sum_male_inherit, sum_female_inherit = 0, 0

    # newline='' is what the csv module requires of files handed to csv.writer, otherwise platforms that
    # translate newlines emit blank rows
    with open(table_out_path, 'w', newline='') as out_p:
        writer = csv.writer(out_p)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "MALE ENTRIES", "FEMALE ENTRIES", "TOTAL ENTRIES",
                         "MALE INHERITANCE COUNT", "FEMALE INHERITANCE COUNT", "TOTAL INHERITANCE COUNT",
                         "MALE INHERITANCE PERCENT", "FEMALE INHERITANCE PERCENT", "TOTAL INHERITANCE PERCENT"])

        for year, counts in inheritance_dict.items():
            male_entries, female_entries = counts["male entrants"], counts["female entrants"]
            male_inherit, female_inherit = counts["male inherit"], counts["female inherit"]

            # increment the global counters
            sum_male_entries += male_entries
            sum_female_entries += female_entries
            sum_male_inherit += male_inherit
            sum_female_inherit += female_inherit

            # yearly sums and percentages
            total_entries = female_entries + male_entries
            total_inherit = female_inherit + male_inherit
            writer.writerow([year, male_entries, female_entries, total_entries,
                             male_inherit, female_inherit, total_inherit,
                             helpers.percent(male_inherit, male_entries),
                             helpers.percent(female_inherit, female_entries),
                             helpers.percent(total_inherit, total_entries)])

        # the global row: sums over all years, with percentages computed from the sums
        sum_total_entries = sum_male_entries + sum_female_entries
        sum_total_inherit = sum_male_inherit + sum_female_inherit
        writer.writerow(["GLOBAL", sum_male_entries, sum_female_entries, sum_total_entries,
                         sum_male_inherit, sum_female_inherit, sum_total_inherit,
                         helpers.percent(sum_male_inherit, sum_male_entries),
                         helpers.percent(sum_female_inherit, sum_female_entries),
                         helpers.percent(sum_total_inherit, sum_total_entries)])
def hierarchical_mobility(person_year_table, profession):
    """
    Finds how many people, each year, moved up, down, or across (i.e. between geographic units in the same
    level) from their level in the judicial hierarchy, deaggregating mobility by gender.

    The levels are {1: low court, 2: tribunal, 3: appellate court, 4: high court}.
    The output dict has the following format:

    {year: {
        level1: {
            "up": {"m": int, "f": int, "dk": int, "total": int, "percent female": percent},
            "down": {"m": int, "f": int, "dk": int, "total": int, "percent female": percent},
            "across": {"m": int, "f": int, "dk": int, "total": int, "percent female": percent}
            },
        level2: {
            "up": {"m": int, "f": int, "dk": int, "total": int, "percent female": percent},
            ...
            },
        ...
        },
     year2: ...
    }

    Years are keyed exactly as they appear in the year column of the table (they are not converted), levels
    are the ints 1 through 4, and "percent female" is whatever helpers.percent returns.

    NB: "m" = male, "f" = "female", "dk" = "don't know".

    NB: by convention, a move is attributed to the last year spent in the old position, i.e. we say there's
        mobility in a year if next year's level or unit is different.

    NB: the input table is left untouched; sorting happens on a copy.

    NB: there is no "down" for low courts, or "up" and "across" for the high court.

    NB: data on retirements ("out") come via exit cohorts from the function "pop_cohort_counts".

    NB: only judges and prosecutors have a hierarchical system -- this function is not sensical for notaries,
        executori, and lawyers.

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: a dict of mobility info
    """
    # look the header up once, then get the column indexes
    header = helpers.get_header(profession, 'preprocess')
    pid_col_idx = header.index('cod persoană')
    gender_col_idx = header.index('sex')
    year_col_idx = header.index('an')
    level_col_idx = header.index('nivel')
    # each unit is uniquely identified by its three-level hierarchical code
    unit_code_idxs = (header.index('jud cod'), header.index('trib cod'), header.index('ca cod'))

    # get the year range and set the mobility types
    years = sorted({py[year_col_idx] for py in person_year_table})
    mobility_types = ["across", "down", "up"]

    # initialise the mobility dict
    mob_dict = {year: {lvl: {mob_type: {"m": 0, "f": 0, "dk": 0, "total": 0, "percent female": 0}
                             for mob_type in mobility_types}
                       for lvl in range(1, 5)}
                for year in years}

    # sort a copy by person ID and year (so the caller's table keeps its order), then group by person
    sorted_table = sorted(person_year_table, key=itemgetter(pid_col_idx, year_col_idx))
    people = [list(person_years)
              for _, person_years in itertools.groupby(sorted_table, key=itemgetter(pid_col_idx))]

    def unit_code(pers_year):
        """Return the full hierarchical code of the unit in which the person-year was spent."""
        return '|'.join([pers_year[idx] for idx in unit_code_idxs])

    # fill in the mobility dict by comparing each person-year with that person's next person-year
    for pers in people:
        gend = pers[0][gender_col_idx]
        for pers_year, next_pers_year in zip(pers, pers[1:]):
            year = pers_year[year_col_idx]
            level, next_level = int(pers_year[level_col_idx]), int(next_pers_year[level_col_idx])

            if level < next_level:
                mob_dict[year][level]["up"][gend] += 1
            elif level > next_level:
                mob_dict[year][level]["down"][gend] += 1
            elif unit_code(pers_year) != unit_code(next_pers_year):
                # same level but different unit, so they moved laterally
                mob_dict[year][level]["across"][gend] += 1

    # update the aggregate values
    for levels in mob_dict.values():
        for mobility_by_type in levels.values():
            for counts in mobility_by_type.values():
                counts["total"] = counts["m"] + counts["f"] + counts["dk"]
                counts["percent female"] = helpers.percent(counts["f"], counts["total"])

    return mob_dict
def career_climbings(person_year_table, profession, use_cohorts, first_x_years):
    """
    Return a dict of metrics on career climbing, i.e. of moving up the judicial hierarchy.

    NB: these metrics are only for a subset of observations, namely those specified by use_cohorts. The purpose
        of this feature is to help us avoid years with rotten data, while giving us a big enough time interval
        to catch movement up two levels

    We want two pieces of information:

    a) total counts and % female of those who stay in low courts, climb to tribunals, and climb to appellate
       courts
    b) average time it took to climb, whether to tribunal or appellate court, for those cohort members who
       climbed to those levels

    NB: the input table is left untouched; sorting happens on a copy.

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param use_cohorts: list of ints, each int represents a year for which you analyse entry cohorts,
                        e.g. [2006, 2007]
    :param first_x_years: int, the number of years from start of career that we consider, e.g. ten years
                          since entry
    :return: dict keyed by 'low court', 'tribunal', 'appellate' and 'high court'; each value is a dict with
             'career type table' (a list of careers, each a list of person-years) and 'counts dict', which
             holds the counts 'm', 'f', 'dk', 'total', plus 'percent female' and 'avrg yrs to promotion'
             (the rounded mean, or 'NA' if no promotion time was recorded for that level)
    """
    # look the header up once, then get the column indexes
    header = helpers.get_header(profession, 'preprocess')
    pid_col_idx = header.index('cod persoană')
    year_col_idx = header.index('an')
    gender_col_idx = header.index('sex')

    # sort a copy by unique person ID and year (so the caller's table keeps its order), then group by person
    sorted_table = sorted(person_year_table, key=itemgetter(pid_col_idx, year_col_idx))
    people = [list(person_years)
              for _, person_years in itertools.groupby(sorted_table, key=itemgetter(pid_col_idx))]

    # initialise dict that breaks down careers by how high they climbed
    counts_dict = {'m': 0, 'f': 0, 'dk': 0, 'total': 0, 'percent female': 0, 'avrg yrs to promotion': 0}
    levels = ['low court', 'tribunal', 'appellate', 'high court']
    careers_by_levels = {lvl: {'career type table': [], 'counts dict': deepcopy(counts_dict)} for lvl in levels}

    # presumably fills each level's 'career type table' in place (its return value is not used)
    fill_careers_by_levels_dict(people, profession, use_cohorts, careers_by_levels)

    # for each career type get basic descriptives
    for step, info in careers_by_levels.items():
        counts = info['counts dict']
        times_to_promotion = []

        for pers in info['career type table']:
            gend = pers[0][gender_col_idx]

            # see time it takes to climb hierarchy; use only first X years of career, to make comparable
            # careers of different total length
            t_to_promotion = time_to_promotion(pers, profession, step, first_x_years)

            if t_to_promotion == 'NA':  # catches low court people
                counts[gend] += 1
            elif min_time_promotion(step) <= t_to_promotion < 11:
                # NOTE(review): the upper bound of 11 is hard-coded and presumably goes with
                # first_x_years == 10 -- confirm before using a different first_x_years
                times_to_promotion.append(t_to_promotion)  # save time to promotion
                counts[gend] += 1
            # else: the person jumped seniority requirements (e.g. came from a different legal profession) or
            # took implausibly long, so they are ignored: neither counted nor averaged

        counts['total'] = counts['f'] + counts['m'] + counts['dk']
        counts['percent female'] = helpers.percent(counts['f'], counts['total'])
        # only numeric times are ever saved above, so the list being empty is the only 'NA' case
        counts['avrg yrs to promotion'] = round(statistics.mean(times_to_promotion)) if times_to_promotion \
            else 'NA'

    return careers_by_levels