# imports assumed by the functions below; helpers, sample, inheritance, and standardise
# are this project's own modules
import csv
import itertools
import operator
from copy import deepcopy
from itertools import groupby
from operator import itemgetter

import natsort

import helpers
import inheritance
import sample
import standardise


def make_percent_pre_1990_table(person_year_table, profession, out_dir, out_dir_area_samp=None, area_sample=False):
    """
    Make a table that shows, for every given year, the percentage of people in the system who entered it in 1990 or
    earlier. This is meant to show the rate of decrease of socialist-era judges. Percentages are disaggregated by
    judicial level, and the table is dumped in a csv.

    NB: when running this metric on the sample, this function assumes that entries and departures of pre-1990 people
        into the sample balance out, so that the sampling itself doesn't influence the before-to-after 1990 ratio.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: str, directory where we want the non-area-sampled results table to live
    :param out_dir_area_samp: str, if given, directory where we want the sampled-area results table to live
    :param area_sample: bool, True if you want to exclusively use data from the Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions; False by default
    :return: None
    """
    if area_sample:
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]  # I hard code this in since it changes very rarely
        person_year_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)
        out_dir = out_dir_area_samp

    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort table by person and year, then group table by persons
    person_year_table = sorted(person_year_table, key=itemgetter(pid_col_idx, yr_col_idx))
    people = [person for k, [*person] in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]

    # get the span of years
    years = sorted(list({int(py[yr_col_idx]) for py in person_year_table}))

    # initialise the nested dict holding the data: hierarchical level -> year -> {"before 1990", "total count"} counts
    b4_1990_year_dict = {i: {yr: {"before 1990": 0, "total count": 0} for yr in years} for i in range(1, 5)}

    for person in people:
        first_career_year = int(person[0][yr_col_idx])
        for pers_yr in person:
            current_year = int(pers_yr[yr_col_idx])
            current_level = int(pers_yr[lvl_col_idx])
            b4_1990_year_dict[current_level][current_year]["total count"] += 1
            if first_career_year <= 1990:
                b4_1990_year_dict[current_level][current_year]["before 1990"] += 1

    # calculate percent from before 1990, only for 1990 onward (before 1990 it's always 100%)
    percs_lvl = {lvl: [] for lvl in b4_1990_year_dict}
    for lvl in b4_1990_year_dict:
        for yr in years:
            if yr >= 1990:
                percs_lvl[lvl].append(helpers.percent(b4_1990_year_dict[lvl][yr]["before 1990"],
                                                      b4_1990_year_dict[lvl][yr]["total count"]))

    # write each level timeseries to disk
    with open(out_dir + "percent_pre_1990.csv", "w") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["Hierarchical Level"] + [yr for yr in years if yr >= 1990])
        for lvl in b4_1990_year_dict:
            writer.writerow([lvl] + percs_lvl[lvl])
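

# Illustrative sketch, not part of the pipeline: the sort-then-groupby idiom used above and throughout this module,
# run on a toy table. The three-column layout (person ID, year, level) is a stand-in; real column indexes come from
# helpers.get_header().
def _demo_group_by_person():
    toy_table = [
        ["p2", "1991", "1"],
        ["p1", "1989", "1"],
        ["p1", "1990", "2"],
    ]
    # groupby only merges adjacent rows, so we must sort by person ID first
    toy_table.sort(key=itemgetter(0, 1))
    people = [person for k, [*person] in itertools.groupby(toy_table, key=itemgetter(0))]
    assert people == [[["p1", "1989", "1"], ["p1", "1990", "2"]], [["p2", "1991", "1"]]]
    return people
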
def fill_careers_by_levels_dict(people, profession, use_cohorts, career_types_dict):
    """
    Update a career types dict (form given in first part of function "career stars").

    :param people: a list of persons, where each "person" is a list of person-years (each itself a list) that share
                   a unique person-level ID
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param use_cohorts: list of ints, each int represents a year for which you analyse entry cohorts, e.g. [2006, 2007]
    :param career_types_dict: a layered dict (form given in first part of function "career stars")
    :return: None
    """

    year_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    level_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    for person in people:
        entry_year = int(person[0][year_col_idx])  # get their entry year
        entry_level = int(person[0][level_col_idx])  # some people start higher because before they were e.g. lawyers
        levels = {int(person_year[level_col_idx]) for person_year in person}  # see what levels they've been in

        # keep only people from specified entry cohorts who started at first level, i.e. no career jumpers
        if entry_year in use_cohorts and entry_level == 1:
            if 4 in levels:
                career_types_dict['high court']['career type table'].append(person)
            elif 3 in levels:
                career_types_dict['appellate']['career type table'].append(person)
            elif 2 in levels:
                career_types_dict['tribunal']['career type table'].append(person)
            else:
                career_types_dict['low court']['career type table'].append(person)
def time_to_promotion(person, profession, level, first_x_years):
    """
    Given a career level, find how long (i.e. how many person years) it took to get there.

    :param person: a list of person years that share a unique person ID
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param level: string, 'tribunal', 'appellate', or 'high court', indicating position in judicial hierarchy
    :param first_x_years: int, how many years after the start of a career we consider, e.g. ten years after
                          joining the profession
    :return: t_to_promotion, int, how long (in years) it took to get promoted
    """

    year_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    level_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # see how long it takes them to get a promotion; compare only first X years of everyone's career
    t_to_promotion = 'NA'
    entry_year = int(person[0][year_col_idx])

    if level == 'tribunal':  # count how many years they were at low court
        t_to_promotion = len([pers_year for pers_year in person if int(pers_year[level_col_idx]) == 1
                              and int(pers_year[year_col_idx]) < entry_year + first_x_years])

    if level == 'appellate':  # count how many years they were at low court or tribunal, i.e. not at appellate
        t_to_promotion = len([pers_year for pers_year in person if (int(pers_year[level_col_idx]) == 1
                                                                    or int(pers_year[level_col_idx]) == 2)
                              and int(pers_year[year_col_idx]) < entry_year + first_x_years])

    if level == 'high court':  # count how many years they were below the high court
        t_to_promotion = len([pers_year for pers_year in person if int(pers_year[level_col_idx]) != 4
                              and int(pers_year[year_col_idx]) < entry_year + first_x_years])

    return t_to_promotion
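

# Toy illustration (invented data, not the real pipeline) of the counting logic in time_to_promotion: for a
# 'tribunal' promotion we count the person-years spent at level 1 (low court) within the first X years of a career.
def _demo_time_to_promotion():
    person = [  # columns assumed: [person ID, year, level]
        ["p1", "2000", "1"],
        ["p1", "2001", "1"],
        ["p1", "2002", "2"],  # promoted to tribunal in the third year
    ]
    entry_year, first_x_years = int(person[0][1]), 10
    t_to_promotion = len([py for py in person
                          if int(py[2]) == 1 and int(py[1]) < entry_year + first_x_years])
    assert t_to_promotion == 2  # two years at low court before promotion
    return t_to_promotion
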
def pop_cohort_counts(person_year_table, start_year, end_year, profession, cohorts=True, unit_type=None, entry=True):
    """
    For each year in the range from start_year to end_year, return a dict of counts of women, men, don't knows,
    cohort size, and percent women for that cohort.

    If units are provided (e.g. geographic areas) it calculates the metrics per each unit, so e.g. cohort size per
    year, for each geographic area.

    NB: there can be entry cohorts (those that joined the profession in year X) and exit cohorts (those that left
    the profession in year Y).

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, year we start looking at
    :param end_year: int, year we stop looking
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param cohorts: bool, True if we want counts for entry and exit cohorts (e.g. all who entered the profession in 
                          2012), False if we want counts for whole population (e.g. all professionals in 2012)
    :param unit_type: None, or string; if string, type of the unit as it appears in header of person_year_table
                      (e.g. "camera")
    :param entry: bool, True if we're getting data for entry cohorts, False if for exit cohorts
    :return: a dict of years, where each value is a dict with gender metrics
    """

    pop_counts = {'grand_total': metrics_dict(start_year, end_year)}
    units = None

    # if we have units, initialise a dict of years for each unit
    if unit_type:
        unit_col_idx = helpers.get_header(profession, 'preprocess').index(unit_type)
        units = {person_year[unit_col_idx] for person_year in person_year_table}
        pop_counts.update({unit: metrics_dict(start_year, end_year) for unit in natsort.natsorted(list(units))})

    # make an identical dict for cohorts
    cohort_counts = deepcopy(pop_counts)

    # get total counts
    for person_year in person_year_table:
        update_size_gender(pop_counts, person_year, start_year, end_year, profession, units, unit_type=unit_type)
    percent_female(pop_counts, units, unit_type=unit_type)

    # then get cohort counts

    # sort table by people and year, then group person-years by person
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    person_year_table.sort(key=itemgetter(pid_col_idx, yr_col_idx))
    people = [person for k, [*person] in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]
    for person in people:
        # if describing entry cohorts we want the first person-year, else the last person-year (i.e. exit cohorts)
        edge_person_year = person[0] if entry else person[-1]
        update_size_gender(cohort_counts, edge_person_year, start_year, end_year, profession,
                           units, unit_type=unit_type)
    percent_female(cohort_counts, units, unit_type=unit_type)
    update_cohort_of_population(cohort_counts, pop_counts, entry=entry, units=units)

    return cohort_counts if cohorts else pop_counts
def top_surnames(person_year_table, top_size, profession):
    """
    Return a set of surnames that are among the N most frequent, where top_size = N.
    e.g. if top_size = 3, we return the most frequent surname in the population (e.g. "SMITH"), the second most
    frequent, and the third most frequent. If multiple names are tied at a given frequency (e.g. SMITH and JONES
    are both equally frequent at number one) then we return all of these (tied) names.

    :param person_year_table: a table of person-years, as a list of lists
    :param top_size: int, number of top names we want to return
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: a set of top-ranked surnames
    """

    # let us know what profession we're on
    print(profession.upper())
    print('  SURNAME FREQUENCIES')

    # make dict of surnames
    surname_col_idx = helpers.get_header(profession, 'preprocess').index('nume')
    surnames = {}
    for person_year in person_year_table:
        for sn in person_year[surname_col_idx].split():
            surnames[sn] = 0

    # count the frequency of each surname; each new person that has that name adds one
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')
    persons = [person for key, [*person] in itertools.groupby(
        sorted(person_year_table, key=itemgetter(pid_col_idx)), key=itemgetter(pid_col_idx))]
    for pers in persons:
        p_sns = pers[0][surname_col_idx].split()  # all person-years have the same surnames, just use the first year
        for sn in p_sns:
            surnames[sn] += 1

    # now make a new dict where keys are frequencies and values are lists of names that have those frequencies
    max_freq = max(surnames.values())
    freq_dict = {i: [] for i in range(1, max_freq + 1)}  # initialise the dict
    for sn, freq in surnames.items():  # fill it
        freq_dict[freq].append(sn)
    freq_dict = {k: v for k, v in freq_dict.items() if v}  # throw out frequencies with no associated names

    # return a set of the top N names (as defined by top_size), and print what the top N are, so we can judge visually
    top_freqs = sorted(list(freq_dict))[-top_size:]
    top_sns = set()
    for i in top_freqs:
        print('    freq: ' + str(i) + ' ; surnames: ', freq_dict[i])
        for sn in freq_dict[i]:
            top_sns.add(sn)
    # a top_size of zero is defined as "there are no top names"
    return top_sns if top_size != 0 else set()
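

# Sketch of the tie-keeping top-N logic in top_surnames, using collections.Counter on invented counts. This is an
# illustration only, not a drop-in replacement: the real function counts one occurrence per person, not per
# person-year.
def _demo_top_names_with_ties():
    from collections import Counter
    counts = Counter({"SMITH": 5, "JONES": 5, "BROWN": 3, "DOE": 1})
    top_size = 2
    top_freqs = sorted(set(counts.values()))[-top_size:]  # the two highest frequencies, here [3, 5]
    top_names = {name for name, freq in counts.items() if freq in top_freqs}
    assert top_names == {"SMITH", "JONES", "BROWN"}  # SMITH and JONES tie at rank one, both are kept
    return top_names
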
def professions_yearspans_cohorts(multiprofessional_person_year_table,
                                  combined=False):
    """
    Given a multiprofessional person-year table, returns a dict of this form:

    {'profession':
        {'start year': int, first observed year for profession
         'end year': int, last observed year for profession
         'entry': {year1: list of entry cohort names for year1, year2: list of entry cohort names for year2, ...}
         'exit': {year1: list of exit cohort names for year1, year2: list of exit cohort names for year2, ...}
         }
    }

    :param multiprofessional_person_year_table: a person-year table that covers multiple professions
    :param combined: bool, True if we're dealing with the table of combined professions
    :return: a dict of data on each profession
    """
    # sort the table by profession and by year
    prof_col_idx = helpers.get_header('all', 'combine').index('profesie')
    year_col_idx = helpers.get_header('all', 'combine').index('an')
    multiprofessional_person_year_table.sort(
        key=itemgetter(prof_col_idx, year_col_idx))

    # make four separate subtables by profession
    professions = [[*prof] for key, prof in itertools.groupby(
        multiprofessional_person_year_table, key=itemgetter(prof_col_idx))]
    data_dict = {}
    for p in professions:
        prof_name = p[0][prof_col_idx]
        start_year, end_year = int(p[0][year_col_idx]), int(
            p[-1][year_col_idx])
        # NB: +1 to entry year to ignore left censor (when all enter),
        # and -1 to exit year to ignore right censor (when all leave)
        entry_cohorts = helpers.cohort_name_lists(p, start_year + 1, end_year, prof_name,
                                                  entry=True, combined=combined)
        exit_cohorts = helpers.cohort_name_lists(p, start_year, end_year - 1, prof_name,
                                                 entry=False, combined=combined)

        data_dict.update({
            prof_name: {
                'start year': start_year,
                'end year': end_year,
                'entry': entry_cohorts,
                'exit': exit_cohorts
            }
        })
    return data_dict
def get_person_measures(pers, profession, right_censor_year):
    """
    Given a person's collection of person years, returns a dict with:
     - number of regional moves in first five years of career
     - number of regional moves in first ten years of career
     - number of regional moves in all career
     - person's hierarchical state sequence
     - person's sequence of relative appellate region moves

    :param pers: list of person-years sharing a unique person-ID
    :param profession: string, "judges", "prosecutors"
    :param right_censor_year: int, year in which we stop observing
    :return: dict
    """

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_col_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the person by year
    pers.sort(key=operator.itemgetter(yr_col_idx))

    # get the sequence of hierarchical states
    level_codes_dict = {"1": "LC", "2": "TB", "3": "CA", "4": "HC"}
    hierar_seq = [level_codes_dict[pers_yr[lvl_col_idx]] for pers_yr in pers]

    # now get the sequence of relative region moves
    # first get a dict where region codes are keys and the order of that region's first appearance is the value
    rel_reg_dict = {}
    counter = 1
    for pers_yr in pers:
        if pers_yr[ca_col_idx] not in rel_reg_dict:
            rel_reg_dict[pers_yr[ca_col_idx]] = counter
            counter += 1
    rel_reg_seq = [str(rel_reg_dict[pers_yr[ca_col_idx]]) for pers_yr in pers]

    # the number of regional moves is simply the number of distinct spells in the relative region sequence
    # need to subtract 1 since not moving still generates one entry, hence one group
    num_reg_moves_first_five = len([k for k, g in itertools.groupby(rel_reg_seq[:5])]) - 1
    num_reg_moves_first_ten = len([k for k, g in itertools.groupby(rel_reg_seq[:10])]) - 1
    num_reg_moves_total = len([k for k, g in itertools.groupby(rel_reg_seq)]) - 1

    # get times to events
    time_to_ret = len(pers) if int(pers[-1][yr_col_idx]) != right_censor_year else None
    time_to_tb = hierar_seq.index("TB") if "TB" in hierar_seq else None
    time_to_ca = hierar_seq.index("CA") if "CA" in hierar_seq else None
    time_to_hc = hierar_seq.index("HC") if "HC" in hierar_seq else None
    time_to_first_geog_move = next((i for i in range(1, len(rel_reg_seq)) if rel_reg_seq[i] != rel_reg_seq[i - 1]),
                                   None)

    return {"time to ret": time_to_ret, "time to tb": time_to_tb, "time to ca": time_to_ca, "time to hc": time_to_hc,
            "time to first geog move": time_to_first_geog_move, "num reg moves first 5": num_reg_moves_first_five,
            "num reg moves first 10": num_reg_moves_first_ten, "num reg moves total": num_reg_moves_total,
            "hierar seq": hierar_seq, "rel reg seq": rel_reg_seq}
def subset_data(full_data_table_path, subset_data_table_path, profession):
    """
    Select a portion of the data to feed into the hazard models, focusing on the desired time period and making it
    possible to run regressions with person-level fixed effects (which means that everyone must have at least two
    observations). Normally this would be done in R, but I'm bad at R subsetting.

    I want the subset table on disk so humans can inspect it.

    :param full_data_table_path: str, path to the full data table
    :param subset_data_table_path: str, path to where we want the subset table to live
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :return: None
    """

    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # load the data
    with open(full_data_table_path, 'r') as in_f:
        full_py_table = list(csv.reader(in_f))
    # initialise subset table, with header
    subset_table = [full_py_table[0]]

    # only keep years between 2006 and 2019, inclusive,
    # and throw out all observations on the high court, i.e. level 4, too few to analyse statistically; skip header
    filtered_table = [
        py for py in full_py_table[1:]
        if 2005 < int(py[yr_col_idx]) < 2020 and py[lvl_col_idx] != "4"
    ]

    # sort table by unique person ID and year then group it by persons
    filtered_table = sorted(filtered_table,
                            key=operator.itemgetter(pid_col_idx, yr_col_idx))
    people = [
        person for k, [*person] in itertools.groupby(
            filtered_table, key=operator.itemgetter(pid_col_idx))
    ]

    # only keep people with minimum two-year careers, since person-level fixed effects
    # need at least two observations per person
    for person in people:
        if len(person) >= 2:
            subset_table.extend(person)

    # write subset table to disk
    with open(subset_data_table_path, 'w') as out_f:
        writer = csv.writer(out_f)
        writer.writerows(subset_table)
def retire(person_year_table, profession):
    """
    Adds a column called "retire" with a "1" in the last year of a career, and a zero if it's not the last year,
    or if the observation is censored.

    NB: assumes that person-year table is already presorted by unique person ID and year

    :param person_year_table: a table of person-years, as a list of lists; comes with header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: the augmented person-year table
    """
    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')

    # save original header
    header = person_year_table[0]

    # sort table by unique person ID and year; skip header
    person_year_table = sorted(person_year_table[1:],
                               key=operator.itemgetter(pid_col_idx,
                                                       yr_col_idx))

    # initialise the new table with updated header
    table_with_retire_col = [header + ["retire"]]

    # get right censor year, i.e. the last year observed anywhere in the table;
    # NB: the table is sorted by person then year, so the last row's year need not be the global maximum
    right_censor_yr = max(py[yr_col_idx] for py in person_year_table)

    # group table by persons
    people = [person for k, [*person] in groupby(person_year_table, key=operator.itemgetter(pid_col_idx))]

    # iterate through people, adding the "retire" value in the last column and extending the updated table
    for person in people:
        new_pers_data = [pers_yr + ["0"] for pers_yr in person]

        # if last year of the career is not right censor year, change that value to "1", i.e. mark retirement
        if person[-1][yr_col_idx] != right_censor_yr:
            new_pers_data[-1][-1] = "1"

        # add the updated person-years to person-period table with the retirement column
        table_with_retire_col.extend(new_pers_data)

    return table_with_retire_col
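

# Small sketch (toy data) of the right-censoring rule in retire(): only careers ending before the last observed
# year get a "1" in their final person-year; careers still running in the censor year stay "0".
def _demo_retire_flag():
    right_censor_yr = "2020"
    people = [  # columns assumed: [person ID, year]
        [["p1", "2018"], ["p1", "2019"]],  # leaves before 2020 -> retirement observed
        [["p2", "2019"], ["p2", "2020"]],  # still present in 2020 -> censored
    ]
    flagged = []
    for person in people:
        rows = [py + ["0"] for py in person]
        if person[-1][1] != right_censor_yr:
            rows[-1][-1] = "1"
        flagged.extend(rows)
    assert flagged[1][-1] == "1" and flagged[3][-1] == "0"
    return flagged
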
def add_inheritance_status(person_period_table, profession, year_window=1000):
    """
    Add column indicating whether or not an individual (i.e. at the person-ID level) inherited their profession.

    :param person_period_table: a person-period table, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param year_window: int, how many years back we look for matches, e.g. "6" means we look for matches in six years
                        prior to your joining the profession; default is "1000", i.e. look back to beginning of data
    :return: a person-period table (as a list of lists) with an extra column for inheritor status
    """

    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')

    # get the set of inheritors
    inheritor_set = inheritance.profession_inheritance(person_period_table, profession, year_window=year_window)

    # initialise new person-period table with inheritance values
    ppt_with_inher = []

    # now append the inheritance value/column to each person-period
    for pers_year in person_period_table:
        inherited = 1 if pers_year[pid_col_idx] in inheritor_set else 0
        ppt_with_inher.append(pers_year + [inherited])

    return ppt_with_inher
def cohort_name_lists(person_year_table,
                      start_year,
                      end_year,
                      profession,
                      entry=True,
                      combined=False):
    """
    For each year in the range from start_year to end_year, return a list of full-names of the people that joined
    the profession in that year.

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, year we start looking at
    :param end_year: int, year we stop looking
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param entry: bool, True if we're getting data for entry cohorts, False if for exit cohorts
    :param combined: bool, True if we're dealing with the table of combined professions
    :return: a dict of years, where each value is a list of full-name tuples of the people who joined the profession
             that year
    """
    stage = 'preprocess'
    if combined:
        profession, stage = 'all', 'combine'

    pid_col_idx = helpers.get_header(profession, stage).index('cod persoană')
    year_col_idx = helpers.get_header(profession, stage).index('an')
    surname_col_idx = helpers.get_header(profession, stage).index('nume')
    given_name_col_idx = helpers.get_header(profession, stage).index('prenume')

    # make a dict, key = year, value = empty list
    cohorts = {year: [] for year in range(start_year, end_year + 1)}
    # group by people
    people = [
        person for k, [*person] in itertools.groupby(
            sorted(person_year_table, key=itemgetter(pid_col_idx)),
            key=itemgetter(pid_col_idx))
    ]

    # append the full name of the first year of each person to its cohort
    for person in people:
        edge_person_year = person[0] if entry else person[-1]

        if start_year <= int(edge_person_year[year_col_idx]) <= end_year:
            cohorts[int(edge_person_year[year_col_idx])].append(
                edge_person_year[surname_col_idx] + ' | ' +
                edge_person_year[given_name_col_idx])

    return cohorts
def people_in_prior_years(current_year, first_year, person_year_table,
                          year_window, profession):
    """
    Make a list of all people (NOT person-years) that appear in the year window prior to the current year.
    The list contains the LAST person-year that we see for each person within the time window,
    e.g. if they left three years before the current year, we see their info from three years ago;
    if they haven't left yet, we see their info from last year.

    NB this matches based on the other person's last location, which is a simplifying heuristic

    :param current_year: int, the current year
    :param first_year: int, first year in the whole person-period table
    :param person_year_table: a table of person years, as a list of lists
    :param year_window: int, how far back we want to look; e.g. if year_window = 3 and current_year = 2008, the
                        window in which we look is 2005 through 2007 (the current year itself is excluded)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: a list of person-years, one per person
    """

    # get column indexes
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')
    year_col_idx = helpers.get_header(profession, 'preprocess').index('an')

    # set the window in which we look for names
    min_year = max(current_year - year_window,
                   first_year)  # prevents us from going under bounds
    window = list(range(min_year, current_year))

    # get all the person years before current year
    pys_in_window = [
        py for py in person_year_table if int(py[year_col_idx]) in window
    ]

    # sort by person-ID and year, groupby person-ID
    pys_in_window.sort(key=itemgetter(pid_col_idx, year_col_idx))
    persons = [
        person
        for key, [*person] in itertools.groupby(pys_in_window,
                                                key=itemgetter(pid_col_idx))
    ]
    # return a list with last observation of each person (where each person is list of person years with a common PID)
    return [pys_by_pers[-1] for pys_by_pers in persons]
def write_preprocessed_to_disk(person_year_table, out_path, profession):
    """
    Write a table of preprocessed person-years to disk.

    :param person_year_table: a table of person-years, as a list of lists
    :param out_path: where to write the preprocessed table
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: None
    """
    with open(out_path, 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(helpers.get_header(profession, 'preprocess'))  # put in header
        writer.writerows(sorted(person_year_table, key=operator.itemgetter(1)))  # sort by unique person ID
def mo_yr_sample(person_month_table, profession, months, years):
    """
    Sample only the person-months from the specified months in the specified years. For the example values below,
    we would sample person-years from April, July, and December of 2006, 2007, and 2008.

    NB: works only on the "collected" table

    :param person_month_table: table of person-months, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param months: iterable of ints for months (1-12), e.g. [4, 7, 12]
    :param years: iterable of ints for years, e.g. [2006, 2007, 2008]
    :return: a person-month table with observations only from the specified months and years
    """
    months, years = set(months), set(years)
    mon_idx = helpers.get_header(profession, 'collect').index('lună')
    year_idx = helpers.get_header(profession, 'collect').index('an')

    # initialise sampled person-month table
    sampled_pm_table = []

    for pm in person_month_table:
        if int(pm[mon_idx]) in months and int(pm[year_idx]) in years:
            sampled_pm_table.append(pm)
    return sampled_pm_table
def appellate_area_sample(person_year_table, profession, appellate_codes):
    """
    Samples person-years whose appellate codes match those given.

    NB: for judges and prosecutors only!

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param appellate_codes: list, strings of appellate codes we wish to sample, e.g. ["CA3", "CA7"]
    :return: a table of person-periods from only the specified appellate areas
    """
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    appellate_codes = set(appellate_codes)  # build the set once instead of on every row
    sampled_person_year_table = []
    for pers_yr in person_year_table:
        if pers_yr[ca_cod_idx] in appellate_codes:
            sampled_person_year_table.append(pers_yr)
    return sampled_person_year_table
def update_kin_match_log(kin_match_log, py, person_already, full_name,
                         current_year, profession):
    """
    Updates the log of kin matches, which we keep for later visual inspection.

    The format of the log table is:
      - left columns: the recruit's full name, year, chamber, and town
      - right columns: same info for the person the recruit is kin-matched with

    :param kin_match_log: list used to keep track of kin matches, for later visual inspection
    :param py: person-year, list of data fields (order given in helpers.get_headers,
            for "preprocess" and particular profession)
    :param person_already: person-year row of person who was in the profession before the recruit
    :param full_name: recruit's full name
    :param current_year: year in which the recruit joined
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: None
    """

    # get column indexes
    surname_col_idx = helpers.get_header(profession,
                                         'preprocess').index('nume')
    year_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    given_name_col_idx = helpers.get_header(profession,
                                            'preprocess').index('prenume')

    if profession in {'notaries', 'executori'}:
        area_col_idx = helpers.get_header(profession,
                                          'preprocess').index('camera')
        sub_area = helpers.get_header(profession, 'preprocess').index(
            'localitatea')  # sub_area = town
    else:  # profession in {'prosecutors', 'judges'}
        area_col_idx = helpers.get_header(profession,
                                          'preprocess').index('trib cod')
        sub_area = helpers.get_header(profession, 'preprocess').index(
            'jud cod')  # sub_area = court/parquet

    rec_area, rec_sub_area = py[area_col_idx], py[sub_area]

    p_alrdy_fn = person_already[surname_col_idx] + ' | ' + person_already[
        given_name_col_idx]
    p_alrdy_chamb, p_alrdy_town = person_already[area_col_idx], person_already[
        sub_area]
    p_alrdy_year = person_already[year_col_idx]

    kin_match_log.append(
        [full_name, current_year, rec_area, rec_sub_area] + [''] +
        [p_alrdy_fn, p_alrdy_year, p_alrdy_chamb, p_alrdy_town])
def retirement_promotion_estimates(person_year_table, profession,
                                   sampling_year_range, out_dir):
    """
    Estimate how many people retire and move up the legal hierarchy (i.e. get promoted) every year, both in raw counts
    and relative to the population of people open to such retirement.

    Post-2005 we have the complete population of magistrates (i.e. judges and prosecutors) but pre-2005 we have only
    non-random samples. For judges I sample four appellate areas (Alba, Craiova, Iaşi, and Ploieşti) because I have
    yearly data on all courts in these areas since at least 1980. That said, mobility estimates from these samples
    need to be corrected. In particular, I look at three sorts of mobility: retirement, promotion, and entry.

    Post-2005 we are certain that someone retires when they are in the population in year X, but absent in year X+1.
    For the pre-2005 we can't be certain, because that person may have left the sample but stayed in the population,
    i.e. they have simply changed appellate area. I therefore correct sample estimates as follows:

    - for the intervals 2006-2007 and 2007-2008, see how many magistrates in the sampled areas (Alba, Craiova, Iaşi,
      and Ploieşti) actually retired, and how many just left their respective area. Compute the ratio
      "retirement counts" / "retirement counts + area leaving counts" for each interval, and average across the two
      intervals. The result is a weight: X% of the people that departed the sampled areas actually retired. There is
      one ratio for each judicial level (i.e. low court, tribunal, and appeals).

    - for pre-2005 I count how many people left the sample, then multiply the per-level count by the appropriate weight.
      Obviously, this assumes that the ratio between retirements and area changes is constant over this period. I cannot
      numerically check that assumption.

    Regarding promotion, post-2005 we can just see if someone's judicial level increased between years. Pre-2005 this
    count will be biased in the sample because a) those who receive a promotion outside the sample show up as
    retirements, and b) those who entered the sample upon promotion look like new entrants. To address this I construct two
    weights: the ratio of within-area promotions to total promotions, and the ratio of entrants-by-promotion to total
    entrants (per year, per level).

    The final count of (weighted) sample promotions is then computed as follows:
    raw promotion count * (1 / within-to-total ratio) + entrant count * promotion-entrants-to-total ratio.

    Finally, to estimate the number of entrants into the profession using the sample, I compute:
    entrant count * (1 - promotion-entrants-to-total ratio).

    Again, the assumption is that the relative balance of inter-area mobility flows is constant throughout the period
    under study, and therefore that ratios derived from 2006-2008 are true of other times as well. I chose the
    2006-2008 period because a) it covers the earliest population-level data, and b) it did not feature major
    judicial reforms.

    We also want estimates of the total size of the population, and of year-on-year population growth.

    :param person_year_table: a table of person-years, as a list of lists; NB: assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, the range of years for which we're estimating mobility,
                                e.g. (1998, 2004)
    :param out_dir: directory where tables of mobility estimates will live
    :return: None
    """

    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')

    # sort person-year table by person then year
    person_year_table.sort(key=operator.itemgetter(pid_col_idx, yr_col_idx))

    # sample all courts in these appeals regions: Alba (CA1), Craiova (CA7), Iaşi (CA9), Ploieşti (CA12)
    appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]
    cas_sample_table = sample.appellate_area_sample(person_year_table,
                                                    profession,
                                                    appellate_areas_to_sample)

    # get weights for retirement, promotion, and entry

    # for those appeals areas, for periods 2006-2007 and 2007-2008, per hierarchical level:
    # a) get ratio of within-area promotions (i.e. people who were already in the area) to total promotions
    # b) get ratio of retirements to retirements + out-of-area transfers
    # Average the values for 2006-07 and 2007-08: these will be weights for estimates from earlier years
    weights = three_year_average_weights(person_year_table, profession,
                                         appellate_areas_to_sample,
                                         ["2006", "2007", "2008"])
    retirement_weights = weights["ret_weight"]
    internal_promotion_weights = weights["int_prom_weight"]
    external_promotion_weights = weights["ext_prom_weight"]

    # get raw counts of entries, retirements and promotions per year, per level, in the desired time-frame
    counts = get_raw_counts(cas_sample_table, profession, sampling_year_range)
    ent_counts, ret_counts, prom_counts = counts["entries"], counts[
        "retirements"], counts["promotions"]
    # now weight those counts with the average ratios from 2006-2008. Recall (counts are from the sample):
    # estimated retirements = retirement count * retirement weight
    # estimated promotions = promotion count * (1 / internal promotion weight) + entry count * external promotion weight
    # estimated entries = entry count * (1 - external promotion weight)
    for key in internal_promotion_weights:
        for year in ret_counts.keys():
            # round to the nearest whole number, since these are whole people
            ret_counts[year][key] = round(
                float(ret_counts[year][key]) * retirement_weights[key])
            prom_counts[year][key] = round(
                float(
                    helpers.weird_division(prom_counts[year][key],
                                           internal_promotion_weights[key]) +
                    float(ent_counts[year][key]) *
                    external_promotion_weights[key]))
            ent_counts[year][key] = round(
                ent_counts[year][key] * (1 - external_promotion_weights[key]))

    # relabel, strictly for clarity (notice it's not a deepcopy)
    weighted_ret_counts = ret_counts
    weighted_prom_counts = prom_counts
    weighted_ent_counts = ent_counts

    # using (weighted-estimated) sample counts, estimate yearly, per-level retirement and promotion probabilities,
    # where the denominator is the sample count of person-years in year X; also estimate what proportion of each
    # year's sample are new entrants
    yearly_counts = counts["total counts"]

    retire_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts}
    promotion_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts}
    entry_proportions = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts}

    for year in yearly_counts:
        for lvl in yearly_counts[year]:
            promotion_probs[year][lvl] = helpers.weird_division(
                weighted_prom_counts[year][lvl], (yearly_counts[year][lvl]))
            retire_probs[year][lvl] = helpers.weird_division(
                weighted_ret_counts[year][lvl], yearly_counts[year][lvl])
            # NB: entry proportions is simple: how many of this year's samples are newcomers?
            entry_proportions[year][lvl] = helpers.weird_division(
                weighted_ent_counts[year][lvl], yearly_counts[year][lvl])

    # estimate the size of the professional population for years for which we only have samples
    estimated_pop = estimated_population_size(person_year_table,
                                              cas_sample_table, profession,
                                              sampling_year_range)

    # estimate year-on-year population growth
    estimated_pop_growth = estimated_population_growth(estimated_pop,
                                                       sampling_year_range)

    # save to disk one table each for retirements, entries, and departures,
    # and one table for estimated population size and growth
    with open(out_dir + "retirements.csv", 'w') as out_ret:
        writer = csv.writer(out_ret)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "LEVEL", "PROJECTED COUNT RETIREMENTS",
            "SAMPLE RETIREMENT PROBABILITY"
        ])
        for year in weighted_ret_counts:
            for lvl in weighted_ret_counts[year]:
                writer.writerow([
                    year, lvl, weighted_ret_counts[year][lvl],
                    retire_probs[year][lvl]
                ])

    with open(out_dir + "promotions.csv", 'w') as out_prom:
        writer = csv.writer(out_prom)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "LEVEL", "PROJECTED COUNT PROMOTIONS",
            "SAMPLE PROMOTION PROBABILITY"
        ])
        for year in weighted_prom_counts:
            for lvl in weighted_prom_counts[year]:
                if lvl in weighted_prom_counts[
                        year] and lvl in promotion_probs[year]:
                    writer.writerow([
                        year, lvl, weighted_prom_counts[year][lvl],
                        promotion_probs[year][lvl]
                    ])

    with open(out_dir + "entries.csv", 'w') as out_ent:
        writer = csv.writer(out_ent)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "LEVEL", "PROJECTED COUNT ENTRIES",
            "SAMPLE ENTRY PROPORTIONS"
        ])
        for year in weighted_ent_counts:
            for lvl in weighted_ent_counts[year]:
                writer.writerow([
                    year, lvl, weighted_ent_counts[year][lvl],
                    entry_proportions[year][lvl]
                ])

    with open(out_dir + "growth.csv", 'w') as out_grow:  # lol
        writer = csv.writer(out_grow)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "PROJECTED POPULATION",
            "SAMPLE PERCENT GROWTH SINCE PREVIOUS YEAR"
        ])
        for year in estimated_pop:
            if year == min(estimated_pop):  # we only know population growth from the second year onward
                writer.writerow([year, estimated_pop[year], "NA"])
            else:
                writer.writerow(
                    [year, estimated_pop[year], estimated_pop_growth[year]])
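

# Worked numeric example (invented numbers) of the sample-weighting arithmetic used in
# retirement_promotion_estimates, to make the formulas concrete:
#   estimated retirements = retirement count * retirement weight
#   estimated promotions  = promotion count / internal promotion weight + entry count * external promotion weight
#   estimated entries     = entry count * (1 - external promotion weight)
def _demo_mobility_weighting():
    ret_count, prom_count, ent_count = 10, 4, 12
    ret_weight, int_prom_weight, ext_prom_weight = 0.8, 0.5, 0.25
    est_ret = round(ret_count * ret_weight)                                       # 10 * 0.8 = 8
    est_prom = round(prom_count / int_prom_weight + ent_count * ext_prom_weight)  # 8 + 3 = 11
    est_ent = round(ent_count * (1 - ext_prom_weight))                            # 12 * 0.75 = 9
    assert (est_ret, est_prom, est_ent) == (8, 11, 9)
    return est_ret, est_prom, est_ent
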
def spurious_name_change_identifiers(inferior_year_careers_end, superior_year_careers_start, id_changes,
                                     associated_names_log, change_distr_year, profession, fullnames=False):
    """
    This function finds names that erroneously change between two years, so that it looks like a person retired in
    year X and another one joined in year X+1, when it's really just the same person under two different names.

    After finding the name change error, it corrects it by assigning both names the same unique person ID.

    This function uses three methods to detect spurious name changes: looking at surname changes, at given-name
    changes, and at slight changes in full-name spelling (where "slight" means a Levenshtein/edit distance of at most 3).

    :param inferior_year_careers_end: list of person years (each a list) from people whose careers ended in said year;
                                      this is the lower year of a pair of years, e.g. 2004 out of (2004, 2005)
    :param superior_year_careers_start: list of person years (each a list) from people whose careers began in said year;
                                        this is the higher year of a pair of years, e.g. 2005 out of (2004, 2005)
    :param id_changes: a dict recording the mapping of IDs. When we realise that two different IDs refer to the
                       same person, this dict tells us which ID to map to which, e.g. 123 : 127 means that ID#123 should
                       become ID#127.
    :param associated_names_log: a list of lists which contains a csv.writer-friendly log, which will let us visually
                                 inspect which names we associated as belonging to the same person
    :param change_distr_year: dict, keys are years and values are counts of name changes/standardisations per year.
                              This dictionary shows us the distribution of changes across years and helps us see which
                              years seem to require more correction (to check whether that pattern makes sense given
                              other known information from data collection)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param fullnames: bool, True if correcting for spurious full name changes, False if separately correcting for both
                      given name and surname spurious changes; fullname = surname + given names
    :return: None, just updates dicts and lists
    """

    workplace_cod_idx = helpers.get_header(profession, "preprocess").index("instituţie")
    yr_col_idx = helpers.get_header(profession, "preprocess").index("an")
    surname_col_idx = helpers.get_header(profession, "preprocess").index("nume")
    given_name_col_idx = helpers.get_header(profession, "preprocess").index("prenume")
    pid_col_idx = helpers.get_header(profession, "preprocess").index('cod persoană')
    gend_col_idx = helpers.get_header(profession, "preprocess").index('sex')

    # FOR THE SURNAME CHANGE CATCHER
    # in each profession there are certain names in the inferior year that we know (from poking at the tables) map
    # onto more than one name in the superior year, which is a noise-introducing error; mark these names so we avoid them
    judges_names_skip = {"ALDEA | MARIA", "ATUDOSEI | MARIA", "BARBU | LARISA LUMINIŢA",
                         "BRĂESCU | MIHAELA", "CERNAT | DOINIŢA",
                         "FRUNZĂ | NECTARA NICOLETA", "GIANGU | MARIA", "HRIB | CRISTINA",
                         "IVĂNUŞCĂ | MARIA", "LEFTER | VIORICA", "MIHALCEA | DOINIŢA",
                         "MUDAVA | SIMONA", "MUNTEAN | DANIELA", "VODA | CODRUŢA",
                         "MĂNĂSTIREANU | CRISTINA", "NECULAU | CRISTINA",
                         "PREDAN | GEORGETA", "SCRIMINŢI | ELENA", "TEODORESCU | OANA",
                         "STOICA | DOINIŢA", "TRANDAFIR | DOINIŢA", "URSACHE | MARIA"}

    name_pairs_lev_dist_apart = []
    if fullnames:
        # FOR THE FULL NAME CHANGE CATCHER
        # get lists of all full names, one list each for the inferior year and superior years
        # and compute Levenshtein/edit distance between the inferior and superior year name lists
        # NB: chose Levenshtein distance of 3 after experimentation, found this number has good true to false
        # positive rate
        inf_full_names = [py[surname_col_idx] + ' | ' + py[given_name_col_idx] for py in inferior_year_careers_end]
        sup_full_names = [py[surname_col_idx] + ' | ' + py[given_name_col_idx] for py in superior_year_careers_start]
        name_pairs_lev_dist_apart = standardise.pairwise_ldist(inf_full_names + sup_full_names, 3)

    # do a pairwise comparison of person-years ending in inferior year and person-years beginning in superior year
    for inf_py in inferior_year_careers_end:
        inf_sn, inf_gn = inf_py[surname_col_idx], inf_py[given_name_col_idx]
        inf_fn, inf_gndr = inf_sn.split() + inf_gn.split(), inf_py[gend_col_idx]

        for sup_py in superior_year_careers_start:
            sup_sn, sup_gn = sup_py[surname_col_idx], sup_py[given_name_col_idx]
            sup_fn, sup_gndr = sup_sn.split() + sup_gn.split(), sup_py[gend_col_idx]

            if fullnames:
                # SPURIOUS FULL NAME CHANGE DETECTOR
                lev_inf_fn, lev_sup_fn = inf_sn + ' | ' + inf_gn, sup_sn + ' | ' + sup_gn
                for name_pair in name_pairs_lev_dist_apart:

                    # if the full names are within 3 edits of each other
                    if lev_inf_fn in name_pair and lev_sup_fn in name_pair:

                        # we only look for putative changes within one workplace
                        if inf_py[workplace_cod_idx] == sup_py[workplace_cod_idx]:

                            # only consider cases where the gender does not change; by convention, map/standardise from
                            # the person ID from the superior year to the person ID from the inferior year
                            if inf_gndr == sup_gndr:
                                id_changes.update({sup_py[pid_col_idx]: inf_py[pid_col_idx]})
                                associated_names_log.append([inf_fn, sup_fn, "", "lev_dist_corrector"])
                                change_distr_year[inf_py[yr_col_idx]] += 1
            else:

                # SPURIOUS GIVEN NAME CHANGE DETECTOR
                # the given names must differ, the surnames must be identical
                if inf_gn != sup_gn and inf_sn == sup_sn:

                    # we only look for putative changes within one workplace
                    if inf_py[workplace_cod_idx] == sup_py[workplace_cod_idx]:

                        # only consider cases where the gender does not change
                        if inf_gndr == sup_gndr:
                            id_changes.update({sup_py[pid_col_idx]: inf_py[pid_col_idx]})
                            associated_names_log.append([inf_fn, sup_fn, "", "given_name_corrector"])
                            change_distr_year[inf_py[yr_col_idx]] += 1

                # SPURIOUS SURNAME CHANGE DETECTOR
                # the given names must be identical, the surnames must differ
                if inf_gn == sup_gn and inf_sn != sup_sn:

                    # avoid longer names (too much noise), and only analyse women;
                    # e.g. ['RADOSLAV', 'MIHAELA', 'NARCISA'] --> ['BUZAN', 'MUDAVA', 'MIHAELA', 'NARCISA']
                    if (len(inf_fn) < 4 and len(sup_fn) < 4) and (inf_gndr == "f" and sup_gndr == "f"):

                        # we only look for putative changes within one workplace
                        if inf_py[workplace_cod_idx] == sup_py[workplace_cod_idx]:

                            # if one full name has multiple surnames, at least one of those surnames must also
                            # appear in the other name; this avoids transformations such as
                            # RICINSCHI | MARIA  -->  DOMINTE POPA | MARIA
                            if len(inf_sn.split()) > 1 or len(sup_sn.split()) > 1:
                                if len(set(inf_sn.split()) & set(sup_sn.split())) > 0:

                                    # avoid names that have been ad-hoc marked as problematic
                                    if inf_sn + ' | ' + inf_gn not in judges_names_skip:
                                        id_changes.update({sup_py[pid_col_idx]: inf_py[pid_col_idx]})
                                        associated_names_log.append([inf_fn, sup_fn, "", "surname_corrector"])
                                        change_distr_year[inf_py[yr_col_idx]] += 1

                            else:  # single-surname cases
                                if inf_sn + ' | ' + inf_gn not in judges_names_skip:
                                    id_changes.update({sup_py[pid_col_idx]: inf_py[pid_col_idx]})
                                    associated_names_log.append([inf_fn, sup_fn, "", "surname_corrector"])
                                    change_distr_year[inf_py[yr_col_idx]] += 1
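

# Sketch of the edit-distance test behind the full-name corrector above. The real code relies on
# standardise.pairwise_ldist (not shown in this module); this stand-in Levenshtein implementation and the
# threshold of 3 mirror the behaviour described in the docstring.
def _levenshtein(a, b):
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (ca != cb)))  # substitution
        prev = curr
    return prev[-1]


def _demo_fullname_match():
    inf_fn, sup_fn = "POPESCU | MARIA", "POPESCU | MARIAN"  # invented names
    assert _levenshtein(inf_fn, sup_fn) <= 3  # close enough to be treated as the same person
    return _levenshtein(inf_fn, sup_fn)
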
def continuity_sample(person_year_table, time_period, profession):
    """
    There are several points in time in which my datasets become increasingly restricted, e.g. before 2005 I only have
    data on half of the parquets, but after 2005 I have data on all the parquets.

    This function samples data based on which institutions/units CONTINUE across a pre-defined time period;
    period bounds are included. For example, if my whole data is 1990-2010, but my time-period is 1995-2007, keep only
    those units for which we have data for both 1995 and 2007 (on the assumption that we also have data for all the
    years in between).

    The point is to make across-time comparison meaningful, since we're just studying those units that are there for
    the whole period, and not muddling things up by also trying to handle units that (dis)appear partway through.

    NB: units may appear and disappear over the time-period because
        a) the units were disbanded, e.g. Scorniceşti court
        b) the units were founded, e.g. DIICOT
        c) I do not have data on those units for the whole period, but that data does exist somewhere

    This function only returns data on units that were there throughout the entire period for which we have data.
    It does not distinguish between units with incomplete data due to substantive reasons (i.e. they were founded
    part-way through) as opposed to research reasons (i.e. we couldn't obtain full-period data for that unit).

    NB: as of 03.08.2020 this function is only meant to work with judges and prosecutors; I have complete data for the
    entire observation periods for the other professions

    :param person_year_table: a table, as a list of lists, where each row is a person-period (e.g. a person-month)
    :param time_period: tuple of ints, boundary years of the time period, e.g. (2005, 2015)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: a person-year table with continuity of allowable workplaces
    """

    year_idx = helpers.get_header(profession, 'preprocess').index('an')
    unit_name_idx = helpers.get_header(profession,
                                       'preprocess').index('instituţie')

    # make two sets: one of unit names that appear in the first year of the period, another set of unit names appearing
    # in the last year of the period
    first_year_unit_names = {
        py[unit_name_idx]
        for py in person_year_table if int(py[year_idx]) == time_period[0]
    }
    last_year_unit_names = {
        py[unit_name_idx]
        for py in person_year_table if int(py[year_idx]) == time_period[1]
    }

    # continuity units are those with data for both the first and the last years of the specified time period
    continuity_unit_names = first_year_unit_names & last_year_unit_names

    # only keep person-years whose year value fall within the specified time period
    period_years = set(range(time_period[0], time_period[1] + 1))
    period_table = [
        py for py in person_year_table if int(py[year_idx]) in period_years
    ]

    # and which are associated with units which were there for the whole period
    period_unit_continuity_table = [
        py for py in period_table if py[unit_name_idx] in continuity_unit_names
    ]

    return period_unit_continuity_table
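

# A minimal illustrative sketch (not part of the pipeline): on hypothetical (year, unit) pairs,
# the continuity rule above reduces to intersecting the sets of unit names observed in the two
# boundary years of the period.
def _continuity_rule_demo():
    rows = [(1995, "TB Alba"), (2001, "TB Alba"), (2007, "TB Alba"),     # present throughout
            (1995, "JD Scorniceşti"), (2001, "JD Scorniceşti"),          # gone before 2007
            (2003, "DIICOT"), (2007, "DIICOT")]                          # founded after 1995
    first_year_units = {unit for yr, unit in rows if yr == 1995}
    last_year_units = {unit for yr, unit in rows if yr == 2007}
    assert first_year_units & last_year_units == {"TB Alba"}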
def kin_match(recruit_data, old_pers_data, pids_chamber_dict, common_surnames,
              profession):
    """
    Applies the kinship matching rules, returns True if there's a match.

    The rule is: if the recruit shares at least one surname with the older profession member AND their chambers match,
    then they're considered kin. The exceptions are:
        - if surnames match AND the office info strings differ by a Levenshtein distance of at most three, then we
          ignore other geographic considerations and match you as kin
        - if surnames match and either their full name is in your office info, or your full name is in their office
          info, then ignore other geographic considerations and match you as kin
        - if one of the surnames is among the most common surnames, then we need a match on BOTH surnames
          before accepting the match
        - if the town is Bucharest then the match has to be not only on chamber but also on town/localitate;
          NB: this puts in an asymmetry where recruits from Bucharest CHAMBER can match Bucharest town, but recruits
              from Bucharest TOWN must match ONLY Bucharest town (not the wider chamber); this is intentional, to
              allow for people from Bucharest town placing their kin in the wider chamber, but not vice versa, since
              it's harder for peripherals to get a foothold downtown than the other way around

    NB: chamber ("camera") indicates the appellate court jurisdiction in which the professional operates. This is also
    the lowest level territorial, professional organisation for notaries and executori.

    NB: the most recent chamber of the person already in the profession can match ANY ONE of the chambers in the career
        of the recruit. This accounts for the pattern that inheritors sometimes start in a different chamber (where
        there's an open spot) then move to the town of their kin as soon as possible.

    :param recruit_data: a list of data values (i.e. a row) for a new recruit;
                         data in order of preprocessed headers, see helpers.helpers.get_header under 'preprocess'
    :param old_pers_data: a list of data values (i.e. a row) for a person that was there before the new recruit;
                         data in order of preprocessed headers, see helpers.helpers.get_header under 'preprocess'
    :param pids_chamber_dict: dict where keys are unique person-IDs and values are lists of the chambers that person
                              has been in
    :param common_surnames: set of strings, of most common surnames
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: bool, True if there's a match, False otherwise
    """

    # get column indexes
    surname_col_idx = helpers.get_header(profession,
                                         'preprocess').index('nume')
    chamber_col_idx = helpers.get_header(profession,
                                         'preprocess').index('camera')
    town_col_idx = helpers.get_header(profession,
                                      'preprocess').index('localitatea')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')

    # get data; NB: surnames turned to sets, which automatically deduplicates repeats, e.g. STAN STAN --> STAN
    rec_pid, rec_sns = recruit_data[pid_col_idx], set(
        recruit_data[surname_col_idx].split(' '))
    rec_town = recruit_data[town_col_idx]
    old_pers_sns, old_pers_chamb = set(old_pers_data[surname_col_idx].split(
        ' ')), old_pers_data[chamber_col_idx]
    old_pers_town = old_pers_data[town_col_idx]

    # initiate set of matches
    matches = set()

    # for each surname
    for sn in rec_sns:

        # if match on surnames and on offices (bar typo); office info only available for executori
        if profession == 'executori':
            sediu_col_idx = helpers.get_header(profession,
                                               'preprocess').index('sediul')
            rec_sediu, old_pers_sediu = recruit_data[
                sediu_col_idx], old_pers_data[sediu_col_idx]
            if rec_sediu != '-88':  # they need some office info, not just empties
                if len(rec_sns & old_pers_sns) > 0 and Levenshtein.distance(
                        rec_sediu, old_pers_sediu) <= 3:
                    matches.add(True)

        # if the sn is not among the most common
        if sn not in common_surnames:
            # if there's at least one name in common AND recruit and person already there share 1+ chambers
            if len(rec_sns & old_pers_sns
                   ) > 0 and old_pers_chamb in pids_chamber_dict[rec_pid]:
                # if town is NOT Bucharest
                if rec_town != "BUCUREŞTI":
                    matches.add(True)
                else:  # recruit's town is Bucharest, old person also needs to be in Bucharest
                    if old_pers_town == "BUCUREŞTI":
                        matches.add(True)
        else:  # if the surname is common, need match on two surnames
            if len(rec_sns & old_pers_sns) > 1:
                matches.add(True)
    # if there's at least one match, return True
    return True in matches
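

# Illustrative sketch of the surname rules in kin_match, on hypothetical names: one shared rare
# surname suffices, but when the only shared surname is common (here POPESCU, a made-up entry in
# common_surnames), a match on BOTH surnames is required.
def _surname_rules_demo():
    common_surnames = {"POPESCU"}
    rec_sns = set("POPESCU STAN".split(' '))
    old_pers_sns = set("POPESCU IONESCU".split(' '))
    shared = rec_sns & old_pers_sns  # {"POPESCU"}
    # the only overlap is a common surname and there is no second shared surname, so no match
    assert shared == {"POPESCU"} and len(shared) < 2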
def demean_periods_by_person(input_table_path, outpath_demeaned_table,
                             profession):
    """
    Makes columns with demeaned values for the period dummies, where the person is the group on which we demean.
    Also includes columns with the person-level mean of the period dummies.

    :param input_table_path: path to table, some of whose variables we'll demean
    :param outpath_demeaned_table: path where table with demeaned variables will live
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :return: None
    """

    # load up the input table and save its header
    with open(input_table_path, 'r') as in_f:
        input_table = list(csv.reader(in_f))
    old_header = input_table[0]

    demeaned_table = []

    # sort the table by person and year, group by person; skip header in old table
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    input_table = sorted(input_table[1:],
                         key=operator.itemgetter(pid_col_idx, yr_col_idx))
    people = [
        person for k, [*person] in itertools.groupby(
            input_table, key=operator.itemgetter(pid_col_idx))
    ]

    demeaned_table_header = old_header

    # indexes for csm, codes, and indep dummies are 16, 17, 18

    # demean the specified variable, where the "mean" of the variable is taken across the specified stratum
    for person in people:
        csm_mean = statistics.mean([int(pers_yr[16]) for pers_yr in person])
        codes_mean = statistics.mean([int(pers_yr[17]) for pers_yr in person])
        indep_mean = statistics.mean([int(pers_yr[18]) for pers_yr in person])

        # now demean the existing values, and add new row to the demeaned table
        for pers_yr in person:
            demeaned_csm = int(pers_yr[16]) - csm_mean
            demeaned_codes = int(pers_yr[17]) - codes_mean
            demeaned_indep = int(pers_yr[18]) - indep_mean

            demeaned_table.append(pers_yr + [
                demeaned_csm, csm_mean, demeaned_codes, codes_mean,
                demeaned_indep, indep_mean
            ])

    demeaned_table_header.extend([
        "demeaned_csm", "csm_mean", "demeaned_codes", "codes_mean",
        "demeaned_indep", "indep_mean"
    ])
    demeaned_table.insert(0, demeaned_table_header)

    # write the demeaned table to disk
    with open(outpath_demeaned_table, 'w') as out_f:
        writer = csv.writer(out_f)
        writer.writerows(demeaned_table)
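

# Worked toy example of the person-level demeaning above: each person-year's dummy value has that
# person's across-career mean subtracted from it, and the mean itself is kept alongside.
def _demeaning_demo():
    person_csm_dummies = [0, 0, 1, 1]  # one person's "csm" dummy over a four-year career
    csm_mean = sum(person_csm_dummies) / len(person_csm_dummies)  # 0.5
    demeaned = [val - csm_mean for val in person_csm_dummies]
    assert demeaned == [-0.5, -0.5, 0.5, 0.5]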
def adjusted_retirement_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of retirement is that it does not distinguish between people who genuinely
    leave the profession and those who simply leave the sample (i.e. move to another area) but remain in the profession.
    Consequently, raw sample retirement counts are biased upwards because profession-exits and sample-exits are
    implicitly equated.

    The solution is to use the total population to compute the fraction of retirements from the sample area that are
    genuine departures from the profession and then to multiply the raw sample retirement count by that fraction,
    thereby reducing the upward bias. To be exact, the genuine retirement fraction is computed by

      genuine retirement fraction = genuine retirement counts / (genuine retirement counts + sample-leaving counts)

    and the adjusted retirement count will therefore be

      adjusted number of retirements = raw sample retirement count * genuine retirement fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we weigh
                    the observed counts in order to reduce bias
    :return a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial hierarchy, and base
            values are the adjusted retirement counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    ret_fracts = {lvl: {"gen_rets": 0, "samp_leaves": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):

            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]
            # if this year is used for the fraction, and within the sampling areas
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx < len(person) - 1:  # since we do look-aheads to see departures-cum-retirements
                    # if next year's area is NOT within the sampling area, increment sample departures
                    if person[idx + 1][ca_cod_idx] not in samp_as:
                        ret_fracts[current_lvl]["samp_leaves"] += 1

                # if last year is used for the fraction and within the sampling areas, increment genuine retirements
                else:  # NB: this always assumes we pick a sampling year that is less than the right censoring year
                    ret_fracts[current_lvl]["gen_rets"] += 1

    # average over the years then get the final fraction, per level
    for lvl in ret_fracts:
        avg_gen_rets = float(ret_fracts[lvl]["gen_rets"]) / float(len(fracts_yrs))
        avg_samp_leave_rets = float(ret_fracts[lvl]["samp_leaves"]) / float(len(fracts_yrs))
        ret_fracts[lvl] = helpers.weird_division(avg_gen_rets, (avg_gen_rets + avg_samp_leave_rets), mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ret_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=False)
    samp_ret_counts.pop("grand_total")  # don't need the grand total

    # and weigh them; round result to four decimals
    for lvl in samp_ret_counts:
        for yr in samp_ret_counts[lvl]:
            samp_ret_counts[lvl][yr] = round(samp_ret_counts[lvl][yr]["total_size"] * ret_fracts[int(lvl)], 4)

    if weights:
        return ret_fracts
    else:
        return samp_ret_counts
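

# Worked example of the adjustment above, with hypothetical per-level yearly averages: 8 genuine
# retirements and 2 sample departures give a genuine retirement fraction of 8 / (8 + 2) = 0.8,
# so a raw sample count of 10 retirements is adjusted down to 8.
def _retirement_adjustment_demo():
    avg_gen_rets, avg_samp_leaves = 8.0, 2.0
    fraction = avg_gen_rets / (avg_gen_rets + avg_samp_leaves)
    assert round(10 * fraction, 4) == 8.0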
def build_data_table(input_py_table,
                     col_augmented_py_table,
                     profession,
                     period_dummies=False):
    """
    Adds columns to our data table, including dummies for the main independent variables. Writes the table to disk
    so we can visually inspect it.

    NB: when we put a person-year within a period, that particular year actually indicates the start year of a two-year
    interval in which mobility may occur, because we measure mobility inter-temporally: e.g. we know person X retired in
    that year because they're not in the data the next year. This explains certain period-coding choices, such as
    1989 not being coded as "communism" but as "revolution", since the mobility value for 1989 actually measures whether
    there was mobility in the interval 1989-1990, which as a whole was a revolutionary interval. The open bracket at
    the end of the period below means that we don't look at the interval starting with that year: so for [2012, 2016),
    for example, we consider 2012-13, 2013-14, 2014-15, 2015-16, but NOT 2016-17.

    The periods are:
     - [2005, 2012): CSM reforms & "mica reformă", val = "csm"
     - [2012, 2016): legal codes reforms, val = "codes"
     - [2016, 2020): judicial independence reforms, val = "indep"

    :param input_py_table: person-year table (as list of lists), without columns necessary for the hazard models
    :param col_augmented_py_table: person-year table (as list of lists), WITH new, desired columns
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param period_dummies: bool, True if you want additional columns which put in one dummy column for each period
    :return: None
    """

    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # initialise the periodisation
    years_periods = {
        2005: "csm",
        2006: "csm",
        2007: "csm",
        2008: "csm",
        2009: "csm",
        2010: "csm",
        2011: "csm",
        2012: "codes",
        2013: "codes",
        2014: "codes",
        2015: "codes",
        2016: "indep",
        2017: "indep",
        2018: "indep",
        2019: "indep"
    }

    # load the old table, saving the original header
    with open(input_py_table) as in_f:
        person_year_table = list(csv.reader(in_f))
    header = person_year_table[0]

    # initialise new table, with updated header
    new_table = [
        header + ["entry", "promotion", "retire", "career_length", "period"]
    ]

    # sort old table by unique person ID and year (skip header), and group it by persons
    person_year_table = sorted(person_year_table[1:],
                               key=operator.itemgetter(pid_col_idx,
                                                       yr_col_idx))
    people = [
        person for k, [*person] in itertools.groupby(
            person_year_table, key=operator.itemgetter(pid_col_idx))
    ]

    # iterate through people, adding columns and extending the new, column-augmented table
    for person in people:

        for idx, pers_yr in enumerate(person):

            # order of columns is: entry, promotion, retire, career length, period
            entry, promotion, retire, career_length, period = 0, 0, 0, 0, ''

            # by convention, first year in career is "1 year of experience," i.e. 1-indexing
            career_length = idx + 1

            # add period; if a certain year is not in our periodisation, leave empty value
            if int(pers_yr[yr_col_idx]) in years_periods:
                period = years_periods[int(pers_yr[yr_col_idx])]

            # first year of career
            if idx < 1:
                entry = 1

            # non edge years
            elif 0 < idx < len(person) - 1:

                # if your level is lower this year than next, it means you're promoted;
                # by convention, promotion is marked in the anterior year
                if pers_yr[lvl_col_idx] < person[idx + 1][lvl_col_idx]:
                    promotion = 1

            else:  # last year of career
                retire = 1

            new_row = pers_yr + [
                entry, promotion, retire, career_length, period
            ]

            new_table.append(new_row)

    # if period dummy switch is on, turn "period" column from factors to dummies, appended to end of table
    if period_dummies:
        new_table_with_dummies = [
            new_table[0] + ["csm_dummies", "codes_dummies", "indep_dummies"]
        ]
        period_col_index_dict = {"csm": 0, "codes": 1, "indep": 2}
        for py in new_table[1:]:  # skip header
            period = py[-1]
            dummy_vals = ['', '', '']
            if period:  # skips rows for which there is no periodisation
                dummy_vals = [0, 0, 0]
                dummy_vals[period_col_index_dict[period]] = 1
            new_table_with_dummies.append(py + dummy_vals)
        new_table = new_table_with_dummies

    # write table to disk
    with open(col_augmented_py_table, 'w') as out_f:
        writer = csv.writer(out_f)
        writer.writerows(new_table)
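

# Illustrative sketch of the factor-to-dummy step above: each period label fills one slot of a
# three-slot dummy vector, and rows outside the periodisation keep empty strings.
def _period_dummies_demo():
    period_col_index_dict = {"csm": 0, "codes": 1, "indep": 2}

    def to_dummies(period):
        if not period:  # row falls outside the 2005-2019 periodisation
            return ['', '', '']
        dummy_vals = [0, 0, 0]
        dummy_vals[period_col_index_dict[period]] = 1
        return dummy_vals

    assert to_dummies("codes") == [0, 1, 0] and to_dummies('') == ['', '', '']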
def adjusted_entry_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of entries is that it does not distinguish between people who are genuinely
    new recruits to the profession, and those who were already in the profession but outside the sample. Consequently,
    the raw count is biased upwards because it equates entering the sample from within the profession with entering
    the profession tout-court.

    The solution is to use the total population to compute the fraction of entries into the sample that are genuine
    recruits into the profession and then to multiply the raw sample entry count by that fraction, thereby reducing the
    upward bias. To be exact, the genuine entry fraction is computed by

      genuine entry fraction = genuine entry counts / (genuine entry counts + sample-entering counts)

    and the adjusted entry count will therefore be

      adjusted number entries = sample entry count * genuine entry fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we weigh
                    the observed counts in order to reduce bias
    :return a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial hierarchy, and base
            values are the adjusted entry counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    ent_fracts = {lvl: {"gen_ents": 0, "samp_ents": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):

            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]

            # if this year is used for the fraction and this year is within the sample area
            if int(current_yr) in fracts_yrs and current_area in samp_as:

                # if it's genuinely the first year, increment genuine entries
                #  NB: this always assumes that we skip the left censor year
                if idx == 0:  # the first year of the career;
                    ent_fracts[current_lvl]["gen_ents"] += 1

                if 1 < idx:  # since we do look-behinds to see if someone entered the sample from elsewhere

                    # if LAST year's appellate area is different from this year's appellate area, increment count of
                    # extra-sample entries
                    if current_area != person[idx - 1][ca_cod_idx]:
                        ent_fracts[current_lvl]["samp_ents"] += 1

    # average over the years then get the final fraction, per level
    for lvl in ent_fracts:
        avg_gen_ents = float(ent_fracts[lvl]["gen_ents"]) / float(len(fracts_yrs))
        avg_samp_ents = float(ent_fracts[lvl]["samp_ents"]) / float(len(fracts_yrs))
        ent_fracts[lvl] = helpers.weird_division(avg_gen_ents, (avg_gen_ents + avg_samp_ents), mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ent_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=True)
    samp_ent_counts.pop("grand_total")  # don't need the grand total
    # and weigh them; round result to four decimals
    for lvl in samp_ent_counts:
        for yr in samp_ent_counts[lvl]:
            samp_ent_counts[lvl][yr] = round(samp_ent_counts[lvl][yr]["total_size"] * ent_fracts[int(lvl)], 4)

    if weights:
        return ent_fracts
    else:
        return samp_ent_counts
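

# Illustrative sketch of the look-behind above, on a hypothetical career given as one appellate
# area code per year: a sample entry is flagged when a year spent inside the sampled areas
# follows a year spent in a different area (the function above additionally skips the earliest
# career years to avoid the left censor).
def _sample_entry_demo():
    samp_as = {"CA1", "CA7"}
    career_areas = ["CA3", "CA3", "CA1", "CA1"]
    samp_ents = sum(1 for idx in range(1, len(career_areas))
                    if career_areas[idx] in samp_as
                    and career_areas[idx] != career_areas[idx - 1])
    assert samp_ents == 1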
def adjusted_lateral_transfer_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of lateral transfers is that it is biased downward, for two reasons:

     a) those who transfer laterally to a position outside the sample will appear to have retired, thus biasing the
        lateral transfer count downward

     b) those who entered the sample via lateral transfer from outside the sample will appear to be new entrants, thus
        biasing the lateral transfer count downward

    Essentially, the sample only counts lateral transfers that occur within the sample, ignoring those lateral
    transfers that feature sample entry or departure.

    To fix this bias we use the total population to compute the genuine lateral transfer ratio, namely

      genuine lateral transfer ratio = (within-sample lateral transfers +
                                 lateral transfers leaving the sample +
                                 lateral transfers entering the sample)
                                            /
                                   within-sample lateral transfers

    and the adjusted lateral transfer count will therefore be

      adjusted number of lateral transfers = within-sample lateral transfer count * genuine lateral transfer ratio

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we weigh
                    the observed counts in order to reduce bias
    :return a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial hierarchy, and base
            values are the adjusted lateral transfer counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    trans_fracts = {lvl: {"within_samp_transfs": 0, "samp_leave_transfs": 0, "samp_ent_transfs": 0}
                    for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):

            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]

            # if this year is used for the fraction and this year is within the sample area
            if int(current_yr) in fracts_yrs and current_area in samp_as:

                if idx < len(person) - 1:  # since we do look-aheads to judge mobility within or leaving the sample

                    # if current hierarchical level is equal to NEXT year's AND the exact workplaces differ
                    # (i.e. there's a lateral transfer this year):
                    if current_lvl == int(person[idx + 1][lvl_col_idx]) and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx + 1], profession):

                        # if next year's area is outside the sample, increment count of leaving-sample transfers
                        if person[idx + 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_leave_transfs"] += 1

                        # if next year's area is within the sample, increment count of within-sample transfers
                        else:
                            trans_fracts[current_lvl]["within_samp_transfs"] += 1

                if 1 < idx:  # we do look behinds to see if someone entered the sample from elsewhere:

                    # if LAST year's hierarchical level was the same as this year's AND the exact workplaces differ
                    # (i.e. a lateral transfer occurred last year)
                    if int(person[idx - 1][lvl_col_idx]) == current_lvl and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx - 1], profession):

                        # if last year's area was not within the sample, increment the count of extra-sample
                        # entries via lateral transfer
                        if person[idx - 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_ent_transfs"] += 1

    # average over the years then get the final fraction, per level
    for lvl in trans_fracts:
        avg_within_samp_transfs = float(trans_fracts[lvl]["within_samp_transfs"]) / float(len(fracts_yrs))
        avg_samp_leave_transfs = float(trans_fracts[lvl]["samp_leave_transfs"]) / float(len(fracts_yrs))
        avg_samp_ent_transfs = float(trans_fracts[lvl]["samp_ent_transfs"]) / float(len(fracts_yrs))
        trans_fracts[lvl] = helpers.weird_division((avg_within_samp_transfs +
                                                    avg_samp_leave_transfs +
                                                    avg_samp_ent_transfs),
                                                   avg_within_samp_transfs, mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_transf_counts = hierarchical.hierarchical_mobility(cas_sample_table, profession)

    # and weigh them; round result to four decimals
    for yr in samp_transf_counts:
        for lvl in samp_transf_counts[yr]:
            samp_transf_counts[yr][lvl] = round(samp_transf_counts[yr][lvl]["across"]["total"] * trans_fracts[lvl], 4)

    if weights:
        return trans_fracts
    else:
        return samp_transf_counts
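

# Worked example of the adjustment above, with hypothetical per-level yearly averages: 6
# within-sample, 2 sample-leaving, and 2 sample-entering lateral transfers give a ratio of
# (6 + 2 + 2) / 6, which scales a raw within-sample count of 6 up to 10.
def _transfer_adjustment_demo():
    avg_within, avg_leaving, avg_entering = 6.0, 2.0, 2.0
    ratio = (avg_within + avg_leaving + avg_entering) / avg_within
    assert round(6 * ratio, 4) == 10.0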
def profession_inheritance(person_year_table, profession, year_window=1000):
    """
    Check if each person inherited their profession.

    The assumption is that if in the year that you enter the profession one of your surnames matches the surname of
    someone who was in the profession before you AND who was, at any point, in the same chamber (appellate court
    jurisdiction) as you are upon entry, then you two are kin. More strict match rules for common surnames and the city
    of Bucharest are discussed in the comments of the relevant match criteria.

    NB: because we consider overlap with ANY surnames (to catch people who add surnames, which is especially
    the case for married women) we make bags of all DISTINCT surnames, so a compound surname like "SMITH ROBSON"
    would become two surnames, "SMITH" and "ROBSON".

    NB: this function is meant to roughly identify kinship and err on the side of inclusion. It assumes that each
    match is then human-checked to weed out false positives, e.g. common surnames that coincidentally overlap. That's
    why I make a match log, not with this function but with that in describe.descriptives.profession_inheritance. Look
    for kin-match log files under analysis/descriptives/inheritance.

    NB: year_window controls how far back in time from the recruit's entry year we look for a match. So if person X
    enters the profession in 2015, and the year window is 5, we look for matches in the interval 2010-2014, inclusive.

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param year_window: int, how many years back we look for matches, e.g. "6" means we look for matches in six years
                        prior to your joining the profession; default is "1000", i.e. look back to beginning of data
    :return: a set of all PIDs who are inheritors
    """

    # get column indexes that we'll need
    year_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    surname_col_idx = helpers.get_header(profession,
                                         'preprocess').index('nume')
    given_name_col_idx = helpers.get_header(profession,
                                            'preprocess').index('prenume')
    chamber_col_idx = helpers.get_header(profession,
                                         'preprocess').index('camera')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')

    # the number of top-ranked surnames (out of the entire set of surnames) that we consider "common", and therefore
    # more liable to lead to false-positives in kin-matches;
    # NB: "0" means that we consider no names common, i.e. max false positive
    # NB: numbers were arrived at by consulting inheritance tables in analysis/descriptives/profession/inheritance
    surname_commonality_cutoffs = {'executori': 5, 'notaries': 14}
    num_top_names = surname_commonality_cutoffs[profession]

    # get set of common surnames across the entire person-year table
    common_surnames = top_surnames(person_year_table, num_top_names,
                                   profession)

    # get year range
    person_year_table.sort(key=itemgetter(year_col_idx))  # sort by year
    start_year, end_year = int(person_year_table[0][year_col_idx]), int(
        person_year_table[-1][year_col_idx])

    # group person-year table by year: make yearly subtables the dict values, keyed by year
    tables_by_year_dict = {
        int(pers_years[0][year_col_idx]): pers_years
        for key, [*pers_years] in itertools.groupby(
            person_year_table, key=itemgetter(year_col_idx))
    }

    # get full names for each yearly entry cohort
    yearly_entry_cohorts_full_names = cohort_name_lists(
        person_year_table, start_year, end_year, profession)

    # make dict where keys are person-ids and values are lists of chambers in which person has served
    pids_chamb_dict = {py[pid_col_idx]: set()
                       for py in person_year_table}  # initialise dict
    for py in person_year_table:  # fill it
        pids_chamb_dict[py[pid_col_idx]].add(py[chamber_col_idx])

    # initialise inheritance set
    inheritor_set = set()

    # starting with the second available year
    for current_year, current_person_years in tables_by_year_dict.items():
        if current_year != min(list(tables_by_year_dict)):

            # get all the people from the previous years
            people_already_here = people_in_prior_years(
                current_year, start_year, person_year_table, year_window,
                profession)

            # get this year's list of names of new recruits, i.e. fresh entrants
            recruits = yearly_entry_cohorts_full_names[current_year]

            # iterate through the current person years
            for py in current_person_years:
                rec_full_name = py[surname_col_idx] + ' | ' + py[
                    given_name_col_idx]  # recruit's full name

                # if that person is a new recruit;
                # NB: full names in 'recruits' are in format 'SURNAMES | GIVEN NAMES'
                if rec_full_name in recruits:

                    # compare recruit with everyone already in profession
                    for person_already in people_already_here:

                        # if recruit has a kin match with someone already in profession
                        if kin_match(py, person_already, pids_chamb_dict,
                                     common_surnames, profession):
                            # add match to inheritance dict
                            inheritor_set.add(py[pid_col_idx])

    # return the set of person-IDs of inheritors
    return inheritor_set
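

# Illustrative sketch of the surname "bags" mentioned above: splitting on spaces and casting to
# a set makes compound surnames comparable piecewise and deduplicates repeated surnames.
def _surname_bag_demo():
    assert set("SMITH ROBSON".split(' ')) == {"SMITH", "ROBSON"}
    assert set("STAN STAN".split(' ')) == {"STAN"}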
def get_geographic_hierarchical_sequences(person_year_table, profession, outdir):
    """
    FOR JUDGES AND PROSECUTORS ONLY

    Makes a table with these columns:
        col 1 = person ID
        col 2 = cohort, i.e. first year in career
        col 3 = gender
        col 4 = length of career (in years)
        col 5 = began at low court
        col 6 = time to retirement (if occurred)
        col 7 = time to tribunal promotion (if occurred)
        col 8 = time to court of appeals promotion (if occurred)
        col 9 = time to high court promotion (if occurred)
        col 10 = time to first regional move (if occurred)
        col 11 = own cohort's average time to retirement (of those that retired)
        col 12 = own cohort's average time to tribunal promotion (of those that were thus promoted)
        col 13 = own cohort's average time to court of appeals promotion (of those that were thus promoted)
        col 14 = own cohort's average time to high court promotion (of those that were thus promoted)
        col 15 = own cohort's average time to first region move (if occurred)
        col 16 = number of region changes in first 5 years on the job
        col 17 = number of region changes in first 10 years on the job
        col 18 = number of region changes in whole career
        col 19 = whole career sequence of hierarchical states
        col 20 = whole career sequence of relative regional location (explained below)
        col 21 = whole career, two-channel sequence of hierarchical - relative region locations (explained below)
        col 22 = reverse order, whole career, two channel sequence (i.e. going from end of career to beginning)
        col 23 = normal order, two-channel sequence, truncated at 10 years
        col 24 = normal order, two-channel sequence, truncated at five years

    The hierarchical position code shows where in the judicial hierarchy the person is. The elements are:
    LC = municipal low court, TB = county tribunal, CA = regional court of appeals, HC = national high court.

    The relative regional location code shows where a person was relative to the first region in which they are
    observed. Regions are defined as court of appeals jurisdictions. Everyone's starting region is "1", then if they
    move to another court of appeals jurisdiction they get the label "2", and get the label "3" if they move to a third
    region. If they move to a fourth, fifth, sixth, etc. region they get the label "4+". So the alphabet is 4-long.
    Note that if a person's sequence is something like "1-2-2-2-2-1-1-1" it means that they left their home region,
    stayed in a second region for a while, then returned to their home region.

    NB: the sequences_log file actually records how many people are observed having moved to 4, 5, 6, etc. regions so
        we can see in greater detail if need be

    We then combine these two element sets (alphabet expansion) to create the final set of elements that we use to
    construct the multi-channel sequences. Below are some examples, the rest follow logically:
        LC+1 = low court, first region ; TB+2 = tribunal, second region; CA+3 = court of appeals, third region

    The sequences will look like e.g. LC+1-LC+1-TB+1-TB+2-TB+2, where each year is separated by a dash ("-").

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors"
    :param outdir: directory in which we want the data and log files to live
    :return: None
    """

    # get indices for person ID, year, and gender
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    gend_col_idx = helpers.get_header(profession, 'preprocess').index('sex')

    years = {pers_yr[yr_col_idx] for pers_yr in person_year_table}
    right_censor_year = max(years)

    # initialise the person-sequence table
    person_sequences_table = []

    # initialise one set of personal IDs of people in two places at once, and another of people with career gaps
    two_places_at_once, career_gaps = set(), set()

    # initialise a dict of cohorts and mean times to events per cohort
    time_to_event_dict = {"TB times": [], "CA times": [], "HC times": [], "ret times": [], "geog move times": []}
    chrt_time_to_event_dict = {chrt: deepcopy(time_to_event_dict) for chrt in years}

    # initialise sequences pools; we'll use these for calculating element frequencies
    hierar_pool, rel_reg_pool, twochannel_pool = [], [], []

    # sort people by unique ID and year, then group by unique ID
    person_year_table.sort(key=operator.itemgetter(pid_col_idx, yr_col_idx))
    people = [person for key, [*person] in itertools.groupby(person_year_table, key=operator.itemgetter(pid_col_idx))]

    # for each person
    for person in people:
        entry_yr = person[0][yr_col_idx]
        pid = person[0][pid_col_idx]
        gndr = person[0][gend_col_idx]

        # remove people with weird careers that indicate some underlying coding errors

        # get a sorted list of all the unique years of a person's career
        career_yrs = sorted(list({int(pers_yr[yr_col_idx]) for pers_yr in person}))
        # if a person is in two places at once they'll have more person-years than years; save to log and skip
        if len(person) > len(career_yrs):
            two_places_at_once.add(pid)
            continue
        # if a person has fewer person-years than the span of their career, there are gaps; save to log and skip
        if len(person) < len(list(range(career_yrs[0], career_yrs[-1] + 1))):
            career_gaps.add(pid)
            continue

        # get sequences
        pers_measures = get_person_measures(person, profession, right_censor_year)
        hierar_seq = pers_measures["hierar seq"]
        rel_reg_seq = pers_measures["rel reg seq"]
        twochannel_seq = [i[0] + "+" + str(i[1]) for i in list(zip(hierar_seq, rel_reg_seq))]
        twochannel_reverse = twochannel_seq[::-1]
        twochannel_trunc_ten = twochannel_seq[:10]
        twochannel_trunc_five = twochannel_seq[:5]

        # add indicator for whether a person began their career at low court (if not, they came extraprofessionally)
        began_at_low_court = 1 if hierar_seq[0] == "LC" else 0

        # add the base sequences to their respective pools
        hierar_pool.extend(hierar_seq)
        rel_reg_pool.extend(rel_reg_seq)
        twochannel_pool.extend(twochannel_seq)

        # turn the sequences into strings, with elements separated by a hyphen ("-")
        hierar_seq, rel_reg_seq, twochannel_seq = "-".join(hierar_seq), "-".join(rel_reg_seq), "-".join(twochannel_seq)
        twochannel_reverse, twochannel_trunc_five = "-".join(twochannel_reverse), "-".join(twochannel_trunc_five)
        twochannel_trunc_ten = "-".join(twochannel_trunc_ten)

        # get metrics on movement between geographic regions
        reg_movs = pers_measures["num reg moves total"]
        reg_movs_first_ten = pers_measures["num reg moves first 10"]
        reg_movs_first_five = pers_measures["num reg moves first 5"]

        # update the person-measures-per-cohort lists
        chrt_time_to_event_dict[entry_yr]["ret times"].append(pers_measures["time to ret"])
        chrt_time_to_event_dict[entry_yr]["TB times"].append(pers_measures["time to tb"])
        chrt_time_to_event_dict[entry_yr]["CA times"].append(pers_measures["time to ca"])
        chrt_time_to_event_dict[entry_yr]["HC times"].append(pers_measures["time to hc"])
        chrt_time_to_event_dict[entry_yr]["geog move times"].append(pers_measures["time to first geog move"])

        person_row = [pid, entry_yr, gndr, len(person), began_at_low_court,
                      pers_measures["time to ret"], pers_measures["time to tb"], pers_measures["time to ca"],
                      pers_measures["time to hc"], pers_measures["time to first geog move"],
                      "", "", "", "", "",
                      reg_movs_first_five, reg_movs_first_ten, reg_movs,
                      hierar_seq, rel_reg_seq, twochannel_seq, twochannel_reverse,
                      twochannel_trunc_ten, twochannel_trunc_five]

        person_sequences_table.append(person_row)

    # for each person observation, mark down that person's cohort's time-to-event, if applicable
    update_average_time_to_event(person_sequences_table, chrt_time_to_event_dict)

    # header for the sequences data table
    header = ["pid", "entry_yr", "gender", "career_length", "start_lc",
              "time_to_ret", "time_to_tb", "time_to_ca", "time_to_hc", "time_to_first_reg_move",
              "chrt_avg_time_ret", "chrt_avg_time_tb", "chrt_avg_time_ca",
              "chrt_avg_to_hc", "chrt_avg_time_first_reg_move",
              "reg_mov_first_5_yrs", "reg_mov_first_10_yrs", "reg_mov_total",
              "hierar_seq", "rel_reg_seq", "multi_seq", "rev_multi_seq", "multi_sec_10", "multi_seq_5"]

    # write the sequences data table to disk
    with open(outdir + "sequences_data_table.csv", 'w') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(header)
        writer.writerows(person_sequences_table)

    # make the log file
    make_log(hierar_pool, rel_reg_pool, twochannel_pool, two_places_at_once, career_gaps, outdir)
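

# Illustrative sketch of the two-channel element construction above: the hierarchical and
# relative-region sequences are zipped, each pair is joined with "+", and the person-years are
# then joined with "-".
def _twochannel_demo():
    hierar_seq, rel_reg_seq = ["LC", "LC", "TB"], ["1", "2", "2"]
    twochannel = [h + "+" + r for h, r in zip(hierar_seq, rel_reg_seq)]
    assert "-".join(twochannel) == "LC+1-LC+2-TB+2"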
def avg_career_length(person_year_table, profession, area_sample=True):
    """
    Print out yearly, average, per-level career length, so we can answer questions like "did tribunal judges
    (i.e. level 2) become more experienced, on average, between 1995 and 2005?"

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param area_sample: bool, True if you want to exclusively use data from Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions
    :return: None
    """

    # TODO if using the sample (and not whole population), need to top up estimates to account for the fact that
    #  some people enter from outside the sample, so it might look like it's their first year, but really they've had
    #  a longer career already

    if area_sample:
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]
        person_year_table = sample.appellate_area_sample(
            person_year_table, profession, appellate_areas_to_sample)

    # add a career length count for each person, for each year

    # group table by persons
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')
    person_year_table = sorted(person_year_table,
                               key=operator.itemgetter(pid_col_idx,
                                                       yr_col_idx))
    people = [
        person
        for k, [*person] in groupby(person_year_table,
                                    key=operator.itemgetter(pid_col_idx))
    ]

    # make an augmented table where the last year is the career length of that person, in that year
    # NB: by convention we 1-index, i.e. your career length is "1" in the first year for which we observe you
    # the career length column is the last one in the table
    augmented_table = []
    for person in people:
        for idx, pers_yr in enumerate(person):
            augmented_table.append(pers_yr + [idx + 1])

    # for each year, get average career length per level
    years = sorted(list({int(py[yr_col_idx]) for py in augmented_table}))
    year_dict = {year: {"1": [], "2": [], "3": [], "4": []} for year in years}

    # sort and group augmented table by year
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')
    augmented_table.sort(key=operator.itemgetter(yr_col_idx))
    year_groups = [
        yr_grp for k, [*yr_grp] in groupby(augmented_table,
                                           key=operator.itemgetter(yr_col_idx))
    ]
    for yr_group in year_groups:
        # recall that a year-group is made of person-years, all sharing the same year, e.g. 1996
        current_year = int(yr_group[0][yr_col_idx])
        # build the per-level person-year lists for each year, in the year_dict
        for pers_yr in yr_group:
            py_lvl = pers_yr[lvl_col_idx]
            year_dict[current_year][py_lvl].append(int(pers_yr[-1]))

    # get the level average for each year
    for yr in year_dict:
        for lvl in year_dict[yr]:
            if year_dict[yr][lvl]:  # need to be careful, no lvl 3 before 1993
                year_dict[yr][lvl] = round(statistics.mean(year_dict[yr][lvl]),
                                           2)

    # print the results
    for yr in year_dict:
        print(yr, ' | ', year_dict[yr])
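

# Illustrative sketch of the 1-indexed career-length convention above: enumerating a person's
# sorted person-years from 1 makes the first observed year count as one year of experience.
def _career_length_demo():
    person = ["1995", "1996", "1997"]  # hypothetical years of one person's career
    career_lengths = [idx + 1 for idx, _ in enumerate(person)]
    assert career_lengths == [1, 2, 3]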
def yearly_weights(person_year_table, profession, appellate_areas_to_sample,
                   weighting_year):
    """
    Get following weights (as ratios), per year, per level:
     - retirement / retire + leave area
     - internal promotion / total promotions
     - external promotion / total entries

    All counts are based on comparing the sampled appellate areas to the population in the other appellate areas.

    NB: these weights pool across sampled areas

    NB: keys in base-level dicts indicate judicial level: 1 = low court, 2 = tribunal, 3 = appeals, 4 = high court

    NB: by convention I turn undefined weights (where the denominator is zero) to zero

    NB: assumes weighting years feature entire population.

    :param person_year_table: a table of person-years, as a list of lists; comes with NO header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param appellate_areas_to_sample: list of appellate area codes indicating which areas we sample, e.g. ["CA1, "CA5"]
    :param weighting_year: year based on which we draw weights. NB: since we measure mobility by comparing this year
                           with adjacent ones (e.g. we know you got promoted because your level in weighting_year is
                           less than your level in weighting_year+1), weighting_year actually signifies an interval.
                           So "2006" refers to mobility in the period 2006-2007. Years are as str, e.g. "2017".
    :return: dict of yearly weights
    """

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')

    # make the dicts that hold mobility counts per level
    lvls_dict = {"1": 0, "2": 0, "3": 0}
    total_retirements, total_area_leaves = deepcopy(lvls_dict), deepcopy(
        lvls_dict)
    total_promotions, internal_promotions = deepcopy(lvls_dict), deepcopy(
        lvls_dict)
    total_entries, external_promotions = deepcopy(lvls_dict), deepcopy(
        lvls_dict)

    # group table by persons
    person_year_table = sorted(person_year_table,
                               key=operator.itemgetter(pid_col_idx,
                                                       yr_col_idx))
    people = [
        person
        for k, [*person] in groupby(person_year_table,
                                    key=operator.itemgetter(pid_col_idx))
    ]

    # iterate through people
    for person in people:

        # iterate through person-years
        for idx, pers_yr in enumerate(person):

            if idx < 1:  # for the first year of the career; NB: this always assumes that we skip the left censor year

                # if first year is sampling year, and the person-year is in the sampling areas
                if pers_yr[yr_col_idx] == weighting_year and pers_yr[
                        ca_cod_idx] in appellate_areas_to_sample:
                    # increment total entries
                    total_entries[pers_yr[lvl_col_idx]] += 1

            elif 0 < idx < len(
                    person) - 1:  # look up to the second-last person-year

                # if this year is sampling year, and this person-year is in the sampling areas
                if pers_yr[yr_col_idx] == weighting_year and pers_yr[
                        ca_cod_idx] in appellate_areas_to_sample:

                    # if current appellate area is different from next year appellate area, increment total area leaves
                    if pers_yr[ca_cod_idx] != person[idx + 1][ca_cod_idx]:
                        total_area_leaves[pers_yr[lvl_col_idx]] += 1

                    # if current appellate area is different from last year's appellate area AND
                    # last year's level is lower than this year's level, increment external promotions
                    if pers_yr[ca_cod_idx] != person[idx - 1][ca_cod_idx] \
                            and person[idx - 1][lvl_col_idx] < pers_yr[lvl_col_idx]:
                        external_promotions[pers_yr[lvl_col_idx]] += 1

                    # if this year's level is lower than next year's level, increment total promotions
                    if pers_yr[lvl_col_idx] < person[idx + 1][lvl_col_idx]:
                        total_promotions[pers_yr[lvl_col_idx]] += 1

                        # if this year's level is lower than next year's
                        # AND this year's appellate area is the same as next years, increment internal promotions
                        if pers_yr[ca_cod_idx] == person[idx + 1][ca_cod_idx]:
                            internal_promotions[pers_yr[lvl_col_idx]] += 1

            else:  # we're in the last year, i.e. the retirement year
                # NB: this always assumes we pick a sampling year that is less than the right censoring year

                # if last year is sampling year and in sampling areas, increment retirements counter
                if person[-1][yr_col_idx] == weighting_year and person[-1][
                        ca_cod_idx] in appellate_areas_to_sample:
                    total_retirements[person[-1][lvl_col_idx]] += 1

    # make retirement weights
    retirement_weights = {}
    for key in total_retirements:
        retirement_weights.update({
            key:
            helpers.weird_division(
                total_retirements[key],
                (total_area_leaves[key] + total_retirements[key]))
        })
    # make internal promotion weights
    internal_promotion_weights = {}
    for key in total_promotions:
        internal_promotion_weights.update({
            key:
            helpers.weird_division(internal_promotions[key],
                                   total_promotions[key])
        })

    # make external promotion weights
    external_promotion_weights = {}
    for key in total_entries:
        external_promotion_weights.update({
            key:
            helpers.weird_division(external_promotions[key],
                                   total_entries[key])
        })

    return {
        "ret_leave": retirement_weights,
        "int_prom": internal_promotion_weights,
        "ext_prom": external_promotion_weights
    }
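

# Illustrative sketch of the weight convention above, with hypothetical counts and a zero-guard
# standing in for helpers.weird_division: undefined ratios, i.e. those with a zero denominator,
# are turned into zero by convention.
def _yearly_weight_demo():
    def safe_ratio(numerator, denominator):
        return numerator / denominator if denominator else 0

    assert safe_ratio(3, 3 + 1) == 0.75  # retirement / (retirement + area leaves)
    assert safe_ratio(0, 0) == 0         # e.g. no promotions observed at some level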
def between_month_mobility(professions, season):
    """
    Compares within-year mobility between two months (typically April and June) across four years, to see if mobility
    in that interval differs significantly across years.

    NB: the unique identifiers here are full names, on the assumption that there is minimal spurious variance in one
    person's full name across a 3-4 month gap in one year (e.g. it's unlikely that they get married and change their
    name in that interval).

    :param professions: dict where key is profession name and value is base path to month-level data table
    :param season: str, season for the months of which we analyse mobility
    :return: None
    """
    if season == 'spring-summer':
        sample_months_years = {'judges': {2020: (4, 6), 2019: (4, 6), 2018: (4, 6), 2017: (4, 6)},
                               'prosecutors': {2020: (4, 7), 2019: (4, 6), 2018: (5, 7), 2016: (4, 6)}}

    else:  # season == fall-winter
        sample_months_years = {'judges': {2019: (9, 12), 2017: (9, 12), 2016: (9, 12), 2015: (9, 12)},
                               'prosecutors': {2019: (9, 12), 2017: (9, 12), 2016: (9, 12), 2015: (9, 12)}}

    for prof, path in professions.items():
        # get person-month table
        with open(path, 'r') as in_file:
            pm_table = list(csv.reader(in_file))[1:]  # start from first index to skip header

        # get samples
        samples = {}
        for year, months in sample_months_years[prof].items():
            # get year_month sample
            samp = sample.mo_yr_sample(pm_table, prof, months, [year])

            # upgrade sample with workplace profile;
            wrk_plc_idx = helpers.get_header(prof, "collect").index("instanță/parchet")
            workplace_codes = workplace.get_workplace_codes(prof)
            samp_upgr = [p_mo + workplace.get_workplace_profile(p_mo[wrk_plc_idx], workplace_codes)
                         for p_mo in samp]  # p_mo means "person month"

            # something is wrong with the CA Craiova data for judges: the numbers are anomalous for 2019 and 2020;
            # remove that area and see if the patterns still hold
            if prof == 'judges':
                samp_upgr = [p_mo for p_mo in samp_upgr if p_mo[-4] != 'CA7']

            samples.update({year: samp_upgr})

            # NB: by this point the column headers would be (in this order):
            # ["nume", "prenume", "instanță/parchet", "an", "lună", "ca cod", "trib cod", "jud cod", "nivel"]

        # initialise the profession-specific dicts holding the descriptives; NB: get counts for "nivel" and "ca cod"
        mob_dict = {'entries': 0, 'exits': 0, 'up': 0, 'down': 0, 'across': 0}
        levels = ['1', '2', '3', '4']
        # 2019 is the one year present in every sample set, so use it to enumerate the appeals regions
        cas = natsorted(list({p_mo[5] for p_mo in samples[2019]}))  # p_mo[5] == "ca cod"
        lvl_mob_dict = {year: {lvl: deepcopy(mob_dict) for lvl in levels} for year in samples}
        ca_mob_dict = {year: {ca: deepcopy(mob_dict) for ca in cas} for year in samples}

        for year, smpl in samples.items():
            first_mo, last_mo = sample_months_years[prof][year][0], sample_months_years[prof][year][1]

            # calculate entries and exits between months, separately for level vs geographic area/agency
            fullnames_in_first_month = {p_mo[0] + ' | ' + p_mo[1] for p_mo in smpl if int(p_mo[4]) == first_mo}
            fullnames_in_second_month = {p_mo[0] + ' | ' + p_mo[1] for p_mo in smpl if int(p_mo[4]) == last_mo}

            # entries: unique full names in second month that aren't in first month
            entries = fullnames_in_second_month - fullnames_in_first_month
            # exits: unique full names in first month that aren't in second month
            exits = fullnames_in_first_month - fullnames_in_second_month
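
            # e.g. (made-up names) if month one has {"ANA | POP", "ION | DAN"} and month two has
            # {"ANA | POP", "GEO | LUP"}, then entries == {"GEO | LUP"} and exits == {"ION | DAN"}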

            # update the mobility dicts with entry and exit counts
            for p_mo in smpl:
                if p_mo[0] + ' | ' + p_mo[1] in entries:
                    lvl_mob_dict[year][p_mo[-1]]['entries'] += 1  # p_mo[-1] = level
                    ca_mob_dict[year][p_mo[-4]]['entries'] += 1  # p_mo[-4] = ca cod
                if p_mo[0] + ' | ' + p_mo[1] in exits:
                    lvl_mob_dict[year][p_mo[-1]]['exits'] += 1
                    ca_mob_dict[year][p_mo[-4]]['exits'] += 1

            # now for each sample calculate intermonth up, down, and across mobilities

            # group the table by person fullname and throw out all fullnames without exactly two observations:
            # people seen only once cannot show within-system mobility, and people seen more than twice are likely
            # record errors (we count both cases below so we know the scale of the measurement error)

            # make a new table with merged names, i.e. first column is the full name
            fn_table = [[p_mo[0] + ' | ' + p_mo[1]] + p_mo[2:] for p_mo in smpl]
            # NB: columns now ["fullname", "instanță/parchet", "an", "lună", "ca cod", "trib cod", "jud cod", "nivel"]

            # group table by persons, identified by unique full name; fullname is now at the zero index
            # sort table by fullname and month
            people = [person for k, [*person] in itertools.groupby(sorted(fn_table, key=operator.itemgetter(0, 3)),
                                                                   key=operator.itemgetter(0))]
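
            # NB: itertools.groupby only groups *consecutive* records, which is why the table is sorted
            # by fullname (the grouping key) before grouping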

            # throw out persons with only one obs (by definition can't experience within-system mobility) as well as
            # persons with more than two observations (e.g. very common names); these are both sources of measurement
            # error, so mark down how many we throw out of each type for our knowledge
            num_one_obs = len([1 for person in people if len(person) < 2])
            num_multi_obs = len([1 for person in people if len(person) > 2])
            print(prof.upper())
            print('  %s TOTAL PERSONS IN YEAR %s' % (len(people), year))
            print('  %s PERSONS WITH ONE OBSERVATION IN YEAR %s' % (num_one_obs, year))
            print('  %s PERSONS WITH THREE OR MORE OBSERVATIONS IN YEAR %s' % (num_multi_obs, year))
            print('\n')

            two_obs_persons = [person for person in people if len(person) == 2]

            # then for each person see if their level changed between the two months; update data dicts accordingly
            # by convention, the level/ca cod for which we report mobility is level/ca cod of first month/period
            for person in two_obs_persons:
                # person[0] == month 1 obs, person[1] == month 2 obs; person[int][-1] == person month obs level

                # down mobility
                if int(person[0][-1]) < int(person[1][-1]):
                    lvl_mob_dict[year][person[0][-1]]['down'] += 1  # person[0][-1] == level
                    ca_mob_dict[year][person[0][-4]]['down'] += 1  # person[0][-4] == ca cod

                # up mobility
                if int(person[0][-1]) > int(person[1][-1]):
                    lvl_mob_dict[year][person[0][-1]]['up'] += 1  # person[0][-1] == level
                    ca_mob_dict[year][person[0][-4]]['up'] += 1  # person[0][-4] == ca cod

                # across mobility
                # person[int][1] == person month obs workplace name
                if int(person[0][-1]) == int(person[1][-1]) and person[0][1] != person[1][1]:
                    lvl_mob_dict[year][person[0][-1]]['across'] += 1  # person[0][-1] == level
                    ca_mob_dict[year][person[0][-4]]['across'] += 1  # person[0][-4] == ca cod

        # now make one profession-level table per mobility type
        mobility_dicts = {'levels_mobility': lvl_mob_dict, 'ca_regions_mobility': ca_mob_dict}
        ordered_years = sorted(list(lvl_mob_dict.keys()))
        mobility_types = list(mob_dict.keys())
        for mob_dict_name, mob_data in mobility_dicts.items():  # avoid shadowing the mob_dict template above

            descr_out_dir = root + 'conference_presentations/ecpr_2020/data/' + prof + '/' \
                            + 'descriptors/' + season + '/'
            mob_table_path = descr_out_dir + mob_dict_name + '.csv'
            with open(mob_table_path, 'w') as out_f:
                writer = csv.writer(out_f)

                for mob_type in mobility_types:
                    writer.writerow([prof.upper()])
                    writer.writerow(["INTER MONTH MOBILITY RATE: %s" % (mob_type.upper())])
                    writer.writerow(['unit/level'] + ordered_years)

                    units = cas if 'ca' in mob_dict_name else levels
                    for u in units:
                        data_row = [u]

                        for year in ordered_years:
                            data_row.append(mob_data[year][u][mob_type])

                        writer.writerow(data_row)
                    writer.writerow([])  # blank separator row; writerow('\n') would emit a quoted newline field
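

# example call, with hypothetical paths given purely for illustration:
# between_month_mobility({'judges': root + 'data/judges_month_level.csv',
#                         'prosecutors': root + 'data/prosecutors_month_level.csv'},
#                        'spring-summer')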