def estimated_population_size(person_year_table, areas_sample_table, profession, sampling_year_range):
    """
    Estimate the total size of the profession for the years in sampling_year_range.

    Assumes that the ratio between the sum of the sampled appeals areas and the sum of the rest of the
    areas remained constant over the entire data period.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param areas_sample_table: a table of person-years within the sampled areas, as a list of lists;
                               assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, range of years for which we're estimating mobility,
                                e.g. (1998, 2004)
    :return: dict of estimated population sizes: keys are years, values are estimates
    """
    # yearly totals for the sample and for the whole population
    samp_size = totals_in_out.pop_cohort_counts(areas_sample_table, 1978, 2020, profession,
                                                cohorts=False, unit_type="nivel")
    pop_size = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                               cohorts=False, unit_type="nivel")

    # sample-to-population size ratios for 2006, 2007 and 2008, averaged across the three years
    ratio_years = (2006, 2007, 2008)
    samp_to_pop_ratios = [samp_size["grand_total"][r_yr]["total_size"]
                          / pop_size["grand_total"][r_yr]["total_size"]
                          for r_yr in ratio_years]
    avg_samp_to_pop_ratio = statistics.mean(samp_to_pop_ratios)

    # for each year in range, multiply the sample count by the reciprocal of the average ratio:
    # these are the population estimates for the years in which we only have samples
    first_yr, last_yr = sampling_year_range
    return {str(year): round(float(samp_size["grand_total"][year]["total_size"])
                             / float(avg_samp_to_pop_ratio))
            for year in samp_size["grand_total"]
            if first_yr <= int(year) <= last_yr}
def entry_exit_unit_table(person_year_table, start_year, end_year, profession, unit_type, out_dir, entry=True):
    """
    Make a csv table where the rows are units and the columns are years; each unit contributes two rows:
    the percent of entries/departures relative to all people in that unit-year, and (underneath) the raw
    count of entries/departures in parentheses.

    NB: units can be geographic regions (e.g. notary "camera") or hierarchical level (e.g. tribunal level
    for judges)

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, the first year we consider
    :param end_year: int, the last year we consider
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param unit_type: string, type of the unit as it appears in header of person_year_table (e.g. "camera")
    :param out_dir: directory where the table will live
    :param entry: bool, True if entry cohorts, False if exit cohorts (i.e. everyone who left in year X)
    :return: None
    """
    # if we look at entry cohorts avoid left censor and include right censor (which function ignores by default)
    if entry:
        start_year += 1
        end_year += 1

    # get data on cohorts by year and unit
    cohorts_per_unit = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                       cohorts=True, unit_type=unit_type, entry=entry)

    # write the table to disk
    type_of_cohort = 'entry' if entry else 'departure'
    out_path = out_dir + profession + '_' + type_of_cohort + '_' + unit_type + '_rates.csv'
    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_path, 'w', newline='') as o_file:
        fieldnames = ['unit'] + list(range(start_year, end_year))  # omit last year: all leave in right censor year
        writer = csv.DictWriter(o_file, fieldnames=fieldnames)
        writer.writeheader()

        # iterate over units
        for unit, years in cohorts_per_unit.items():
            percent_row = {'unit': unit}
            count_row = {'unit': ''}
            # iterate over the years:
            for year, measures in years.items():
                if start_year <= int(year) <= end_year - 1:  # stay within bounds
                    percent_row.update({year: measures['chrt_prcnt_of_pop']})
                    count_row.update({year: '(' + str(measures['total_size']) + ')'})
            # display the count row under the percent row
            writer.writerow(percent_row)
            writer.writerow(count_row)
def estimated_population_size(person_year_table, profession):
    """
    Estimate the total size of the profession (i.e. of the population) for years in which we only have
    a sample.

    For years where we DO observe the whole population we compute

        population inflation ratio = population size / sample size

    and for sample-only years we estimate

        population size = sample size * population inflation ratio

    NB: this assumes that the ratio between the total population and the sum of the sampled areas is
    roughly constant across the years whose population sizes we're estimating.

    NB: shadows the earlier module-level definition of the same name.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :return: dict of estimated population sizes: keys are years, values are estimates
    """
    samp_yrs, samp_as, ratio_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    pop_yrs, total_yrs = pop_yr_range[profession], total_range[profession]

    # restrict to the sampled appellate areas
    areas_sample_table = sample.appellate_area_sample(person_year_table, profession, samp_as)

    # yearly totals for the area sample and for the full population
    samp_size = totals_in_out.pop_cohort_counts(areas_sample_table, total_yrs[0], total_yrs[1], profession,
                                                cohorts=False, unit_type="nivel")
    pop_size = totals_in_out.pop_cohort_counts(person_year_table, pop_yrs[0], pop_yrs[1], profession,
                                               cohorts=False, unit_type="nivel")

    # average sample and population sizes across the ratio years
    n_ratio_yrs = float(len(ratio_yrs))
    avg_samp_size = float(sum(samp_size["grand_total"][r_yr]["total_size"] for r_yr in ratio_yrs)) / n_ratio_yrs
    avg_total_size = float(sum(pop_size["grand_total"][r_yr]["total_size"] for r_yr in ratio_yrs)) / n_ratio_yrs
    pop_inflation_ratio = avg_total_size / avg_samp_size

    # for each sample-only year, inflate the observed sample size; round to 4 decimals
    return {yr: round(float(samp_size["grand_total"][yr]["total_size"] * pop_inflation_ratio), 4)
            for yr in range(samp_yrs[0], samp_yrs[1] + 1)}
def adjusted_retirement_counts(person_year_table, profession, weights=False):
    """
    Adjust raw sample retirement counts for the fact that leaving the sample is not the same as leaving
    the profession.

    Raw sample retirement counts conflate genuine departures from the profession with mere departures
    from the sampled areas (people who move elsewhere but stay in the profession), biasing the counts
    upwards. Using the total population we compute, per hierarchical level,

        genuine retirement fraction = genuine retirements / (genuine retirements + sample departures)

    and deflate the raw sample retirement counts by that fraction:

        adjusted number of retirements = raw sample retirement count * genuine retirement fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True return the deflation fractions themselves instead of adjusted counts
    :return: a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial
             hierarchy, and base values are the adjusted retirement counts (or, if weights=True, a dict
             of level -> fraction)
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    header = helpers.get_header(profession, 'preprocess')
    yr_col_idx, ca_cod_idx, lvl_col_idx = header.index('an'), header.index('ca cod'), header.index('nivel')

    # sort the population table by person and year, then group it into individual careers
    sorted_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)
    people = helpers.group_table_by_persons(sorted_table, profession)

    # tally genuine retirements vs. sample departures, per level
    # NB: four possible levels, even though level 3 (Appeals courts) only began in 1993
    ret_fracts = {lvl: {"gen_rets": 0, "samp_leaves": 0} for lvl in range(1, 5)}
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr = pers_yr[yr_col_idx]
            current_lvl = int(pers_yr[lvl_col_idx])
            current_area = pers_yr[ca_cod_idx]
            # only person-years used for the fraction AND within the sampled areas count
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx < len(person) - 1:
                    # look ahead: moving outside the sampled areas next year is a sample departure
                    if person[idx + 1][ca_cod_idx] not in samp_as:
                        ret_fracts[current_lvl]["samp_leaves"] += 1
                else:
                    # career's last observed year: a genuine retirement
                    # NB: this always assumes we pick a sampling year that is less than the right-censor year
                    ret_fracts[current_lvl]["gen_rets"] += 1

    # per level: average both tallies across the fraction years, then compute the final fraction
    for lvl in ret_fracts:
        avg_gen = float(ret_fracts[lvl]["gen_rets"]) / float(len(fracts_yrs))
        avg_leaves = float(ret_fracts[lvl]["samp_leaves"]) / float(len(fracts_yrs))
        ret_fracts[lvl] = helpers.weird_division(avg_gen, (avg_gen + avg_leaves), mult_const=True)

    # get the raw retirement counts within the sampled areas
    cas_sample_table = sample.appellate_area_sample(sorted_table, profession, samp_as)
    samp_ret_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=False)
    samp_ret_counts.pop("grand_total")  # don't need the grand total

    # deflate the raw counts by the per-level fraction; round result to four decimals
    for lvl in samp_ret_counts:
        for yr in samp_ret_counts[lvl]:
            samp_ret_counts[lvl][yr] = round(samp_ret_counts[lvl][yr]["total_size"] * ret_fracts[int(lvl)], 4)

    return ret_fracts if weights else samp_ret_counts
def adjusted_entry_counts(person_year_table, profession, weights=False):
    """
    Adjust raw sample entry counts for the fact that entering the sample is not the same as entering
    the profession.

    Raw sample entry counts conflate genuine recruits with people who were already in the profession but
    outside the sampled areas, biasing the counts upwards. Using the total population we compute, per
    hierarchical level,

        genuine entry fraction = genuine entries / (genuine entries + sample-entering counts)

    and deflate the raw sample entry counts by that fraction:

        adjusted number of entries = sample entry count * genuine entry fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True return the deflation fractions themselves instead of adjusted counts
    :return: a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial
             hierarchy, and base values are the adjusted entry counts (or, if weights=True, a dict of
             level -> fraction)
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    header = helpers.get_header(profession, 'preprocess')
    yr_col_idx, ca_cod_idx, lvl_col_idx = header.index('an'), header.index('ca cod'), header.index('nivel')

    # sort the population table by person and year, then group it into individual careers
    sorted_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)
    people = helpers.group_table_by_persons(sorted_table, profession)

    # tally genuine entries vs. sample entries from elsewhere in the profession, per level
    # NB: four possible levels, even though level 3 (Appeals courts) only began in 1993
    ent_fracts = {lvl: {"gen_ents": 0, "samp_ents": 0} for lvl in range(1, 5)}
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr = pers_yr[yr_col_idx]
            current_lvl = int(pers_yr[lvl_col_idx])
            current_area = pers_yr[ca_cod_idx]
            # only person-years used for the fraction AND within the sampled areas count
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx == 0:
                    # career's very first year: a genuine entry into the profession
                    # NB: this always assumes that we skip the left-censor year
                    ent_fracts[current_lvl]["gen_ents"] += 1
                if 1 < idx:
                    # look behind: arriving from a different appellate area counts as entering the
                    # sample from within the profession
                    # NOTE(review): idx == 1 is excluded from this look-behind even though a previous
                    # year exists — confirm this is intended
                    if current_area != person[idx - 1][ca_cod_idx]:
                        ent_fracts[current_lvl]["samp_ents"] += 1

    # per level: average both tallies across the fraction years, then compute the final fraction
    for lvl in ent_fracts:
        avg_gen = float(ent_fracts[lvl]["gen_ents"]) / float(len(fracts_yrs))
        avg_samp = float(ent_fracts[lvl]["samp_ents"]) / float(len(fracts_yrs))
        ent_fracts[lvl] = helpers.weird_division(avg_gen, (avg_gen + avg_samp), mult_const=True)

    # get the raw entry counts within the sampled areas
    cas_sample_table = sample.appellate_area_sample(sorted_table, profession, samp_as)
    samp_ent_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=True)
    samp_ent_counts.pop("grand_total")  # don't need the grand total

    # deflate the raw counts by the per-level fraction; round result to four decimals
    for lvl in samp_ent_counts:
        for yr in samp_ent_counts[lvl]:
            samp_ent_counts[lvl][yr] = round(samp_ent_counts[lvl][yr]["total_size"] * ent_fracts[int(lvl)], 4)

    return ent_fracts if weights else samp_ent_counts
def get_raw_counts(person_year_table, profession, sampling_year_range):
    """
    Get counts, in different years, of: number of professionals, entries, retirements, promotions.
    Keep only data for years within sampling_year_range.

    :param person_year_table: a table of person-years, as a list of lists; comes with header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, range of years for which we're estimating mobility,
                                e.g. (1998, 2004)
    :return: dict of dicts, where top-level keys indicate nature of count (e.g. "retirements") and
             bottom-level dicts show counts per judicial level (e.g. lvl 2, i.e. tribunals)
    """
    low, high = sampling_year_range

    total_counts = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                                   cohorts=False, unit_type="nivel")
    entries = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                              cohorts=True, unit_type="nivel", entry=True)
    retirements = totals_in_out.pop_cohort_counts(person_year_table, 1978, 2020, profession,
                                                  cohorts=True, unit_type="nivel", entry=False)
    promotions = hierarchical.hierarchical_mobility(person_year_table, profession)
    years = list(promotions.keys())

    retirements.pop("grand_total")
    total_counts.pop("grand_total")

    # keep only level data for years within the specified year range
    for yr in years:
        out_of_range = int(yr) < low or int(yr) > high

        # toss out extraneous years for promotions
        if out_of_range and yr in promotions:
            promotions.pop(yr)

        # toss out extraneous years for entries, retirements and total counts
        for lvl in retirements:
            if out_of_range and int(yr) in retirements[lvl]:
                entries[lvl].pop(int(yr))
                retirements[lvl].pop(int(yr))
                total_counts[lvl].pop(int(yr))

    # for entries, retirements and total counts keep only the total size for each year;
    # keys of promotions are the years, apply to all dicts
    ent_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    ret_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    tot_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    for lvl in retirements:
        for year in retirements[lvl]:
            ent_counts[str(year)][lvl] = entries[lvl][year]["total_size"]
            ret_counts[str(year)][lvl] = retirements[lvl][year]["total_size"]
            tot_counts[str(year)][lvl] = total_counts[lvl][year]["total_size"]

    # for promotions, keep only the count of upward moves per year and level
    prom_counts = {year: {"1": 0, "2": 0, "3": 0} for year in promotions}
    for year in promotions:
        for lvl in promotions[year]:
            prom_counts[year][str(lvl)] = promotions[year][lvl]["up"]["total"]

    return {"entries": ent_counts,
            "retirements": ret_counts,
            "promotions": prom_counts,
            "total counts": tot_counts}
def entry_exit_gender(person_year_table, start_year, end_year, profession, out_dir, entry=True, unit_type=None):
    """
    Make a table that shows the count and percentage of entry and exit cohorts for each gender, and for
    each unit if applicable.

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, the first year we consider
    :param end_year: int, the last year we consider
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: directory where the table will live
    :param entry: bool, True if entry cohorts, False if exit cohorts (i.e. everyone who left in year X)
    :param unit_type: None, or if provided, a string indicating the type of unit (e.g. appellate court region)
    :return: None
    """
    type_of_cohort = 'entry' if entry else 'departure'

    if unit_type:
        out_path = out_dir + profession + '_' + unit_type + '_' + type_of_cohort + '_cohorts_gender.csv'
        fieldnames = ["unit"] + ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        cohorts = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                  cohorts=True, unit_type=unit_type, entry=entry)
    else:
        out_path = out_dir + profession + '_' + type_of_cohort + '_cohorts_gender.csv'
        fieldnames = ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        cohorts = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                  cohorts=True, unit_type=None, entry=entry)

    # write table to disk
    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_path, 'w', newline='') as o_file:
        writer = csv.DictWriter(o_file, fieldnames=fieldnames)
        writer.writeheader()

        # if we're given unit types
        if unit_type:
            # iterate over units
            for unit, years in cohorts.items():
                if unit != 'grand_total':
                    # iterate over the years:
                    for year, metrics in years.items():
                        if start_year <= int(year) <= end_year - 1:  # stay within bounds
                            writer.writerow({"unit": unit, "year": year,
                                             "female": metrics['f'], "male": metrics["m"],
                                             "don't know": metrics['dk'],
                                             "total count": metrics['total_size'],
                                             "percent female": metrics['percent_female']})
        else:  # no units, just straight years
            for year, metrics in cohorts['grand_total'].items():
                writer.writerow({"year": year,
                                 "female": metrics['f'], "male": metrics["m"],
                                 "don't know": metrics['dk'],
                                 "total count": metrics['total_size'],
                                 "percent female": metrics['percent_female']})
def year_counts_table(person_year_table, start_year, end_year, profession, out_dir, unit_type=None):
    """
    Makes a table of yearly population counts, and optionally breaks down total counts by unit_type.
    Also appends rows listing which appeals and tribunal areas appear in the input table.

    :param person_year_table: a table of person-years, as a list of lists
    :param start_year: int, the first year we consider
    :param end_year: int, the last year we consider
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: directory where the table will live
    :param unit_type: None, or if provided, a string indicating the type of unit (e.g. appellate court region)
    :return: None
    """
    if unit_type:
        out_path = out_dir + profession + '_' + unit_type + '_year_totals.csv'
        fieldnames = ["unit"] + ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        year_metrics = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                       cohorts=False, unit_type=unit_type)
    else:
        out_path = out_dir + profession + '_year_totals.csv'
        fieldnames = ["year"] + ["female", "male", "don't know", "total count", "percent female"]
        year_metrics = totals_in_out.pop_cohort_counts(person_year_table, start_year, end_year, profession,
                                                       cohorts=False)

    # make table and write to disk
    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_path, 'w', newline='') as o_file:
        writer = csv.DictWriter(o_file, fieldnames=fieldnames)
        writer.writeheader()

        if unit_type:
            # iterate over units
            for unit, years in year_metrics.items():
                if unit != 'grand_total':
                    # iterate over years:
                    for year, metrics in years.items():
                        if start_year <= int(year) <= end_year:  # stay within bounds
                            writer.writerow({"unit": unit, "year": year,
                                             "female": metrics['f'], "male": metrics["m"],
                                             "don't know": metrics['dk'],
                                             "total count": metrics['total_size'],
                                             "percent female": metrics['percent_female']})
        else:  # no units, just straight years
            for year, metrics in year_metrics['grand_total'].items():
                writer.writerow({"year": year,
                                 "female": metrics['f'], "male": metrics["m"],
                                 "don't know": metrics['dk'],
                                 "total count": metrics['total_size'],
                                 "percent female": metrics['percent_female']})

        # finally, show which appeals and tribunal areas were sampled
        ca_col_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
        trib_col_idx = helpers.get_header(profession, 'preprocess').index('trib cod')
        ca_areas = sorted(list({py[ca_col_idx] for py in person_year_table}))
        tb_areas = sorted(list({py[trib_col_idx] for py in person_year_table}))
        writer.writerow({"year": ''})
        writer.writerow({"year": "SAMPLED COURT OF APPEALS AREAS", "female": ca_areas})
        writer.writerow({"year": "SAMPLED TRIBUNAL AREAS", "female": tb_areas})
def make_vacancy_transition_tables(person_year_table, profession, out_dir, years, averaging_years=None,
                                   area_samp=False, out_dir_area_samp=None):
    """
    Make a csv containing one sub-table for each of the years that we select, with each sub-table showing
    the transition probabilities between hierarchical levels of vacancies. Optionally, we may also include
    a table that averages across desired years, e.g. 1984-1989.

    Each sub-table should be NxN+1, where N = number of levels, and the last column represents vacancies
    leaving the system, i.e. people being recruited into the system.

    NB: diagonals signify mobility WITHIN the level

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param out_dir: str, the path to where the transition matrices will live
    :param years: list of ints, the years for which we want vacancy probability transition matrixes
    :param averaging_years: list of ints over which we want to average vacancy frequency tables,
                            e.g. [1985, 1986, 1987]
    :param area_samp: bool, True if we want to sample from specific areas
    :param out_dir_area_samp: if given, str showing the out-directory where we want the vacancy transition
                              tables for the sample areas to live
    :return: None
    """
    averaging_years = averaging_years if averaging_years else []  # if no averaging years provided, make empty list
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)
    proms_weights, demos_weights, transfs_weights = None, None, None  # throws up errors if things go awry

    # get entry counts, in easy format
    entry_counts = totals_in_out.pop_cohort_counts(sorted_person_year_table, years[0], years[-1], profession,
                                                   cohorts=True, unit_type="nivel", entry=True)
    entry_counts.pop("grand_total")  # don't need the grand total
    for lvl in entry_counts:
        for yr in entry_counts[lvl]:
            entry_counts[lvl][yr] = entry_counts[lvl][yr]["total_size"]

    if area_samp:
        # I hard code these in since they change so rarely
        samp_areas = {"judges": ["CA1", "CA7", "CA9", "CA12", "-88"], "prosecutors": []}
        samp_yr_range = {"judges": [1980, 2003], "prosecutors": []}
        samp_yrs, samp_as = samp_yr_range[profession], samp_areas[profession]

        # get sample-adjusted entry counts and sample weights for mobility
        entry_counts = area_samples.adjusted_entry_counts(person_year_table, profession)
        proms_weights = area_samples.adjusted_promotion_counts(sorted_person_year_table, profession, weights=True)
        demos_weights = area_samples.adjusted_demotion_counts(sorted_person_year_table, profession, weights=True)
        transfs_weights = area_samples.adjusted_lateral_transfer_counts(sorted_person_year_table, profession,
                                                                        weights=True)

        # restrict person-year table to sampling areas and redirect the out-directory
        sorted_person_year_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
        out_dir = out_dir_area_samp

    # get person-level transition frequencies between levels
    trans_freqs = inter_level_transition_matrices(sorted_person_year_table, profession)

    # newline='' is required when handing a file to the csv module; without it the output
    # gets spurious blank rows on Windows
    with open(out_dir + "vacancy_probability_transition_matrixes.csv", "w", newline='') as out_f:
        writer = csv.writer(out_f)

        # accumulator for the optional across-years average; unused if averaging_years stays empty
        # BUG FIX: was np.empty, which holds uninitialized memory — np.add would then fold garbage
        # into the averaged matrix; np.zeros gives a proper additive identity
        avg_vac_trans_mat = np.zeros((4, 5), float)

        # for each sampling year
        for yr in years:
            # make array of zeros, for four levels; not all years have four levels, but zero rows/columns
            # are harmless
            trans_mat = np.zeros((4, 4))

            for lvl in range(1, 5):  # departure levels in the system, i.e. the level FROM which mobility happens
                if lvl in trans_freqs[yr]:  # if the levels exist in that year (since some are added later)
                    # now weigh the observed values
                    # NB: route = mobility route, e.g. "1-2" means "mobility from level 1 to level 2"
                    for route, mob_freq in trans_freqs[yr][lvl].items():
                        # ignore retirements, non-movements, sums, and discontinuities
                        if route.split("-")[1].isdigit():
                            # level you leave and level you go to; -1 since numpy zero indexes
                            departing, arriving = int(route.split("-")[0]) - 1, int(route.split("-")[1]) - 1

                            # put frequency counts in the matrix; if sampling, weigh the counts
                            if departing < arriving:  # promotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * proms_weights[lvl], 5)
                            if departing == arriving:  # lateral transfers
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * transfs_weights[lvl], 5)
                            if departing > arriving:  # demotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * demos_weights[lvl], 5)

            # transpose the person-level mobility frequency matrix to get the vacancy mobility matrix
            vac_trans_mat = np.transpose(trans_mat)

            # by convention, we thus far treated levels in incrementing order, i.e. level 1 < 2 < 3 < 4.
            # The convention in vacancy chains studies is that 1 > 2 > 3 > 4, and to get that we transpose
            # the array along the anti-diagonal/off-diagonal
            vac_trans_mat = vac_trans_mat[::-1, ::-1].T

            # in the last column we put vacancy "retirements", i.e. entries of people into the system
            entry_freqs = [entry_counts[str(level)][yr] for level in range(1, 5) if str(level) in entry_counts]
            entries_col = np.asarray(entry_freqs[::-1])[..., None]  # give it Nx1 shape; reverse for 1 > 2 > 3...
            vac_trans_mat = np.append(vac_trans_mat, entries_col, 1)

            if yr in averaging_years:
                avg_vac_trans_mat = np.add(avg_vac_trans_mat, vac_trans_mat)

            vac_prob_mat = freq_mat_to_prob_mat(vac_trans_mat.tolist(), round_to=5)

            # add that transition probability matrix to table
            writer.writerow([profession.upper(), yr])
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(vac_prob_mat)):
                writer.writerow([header[1:][i]] + vac_prob_mat[i])
            writer.writerow(["\n"])

        if averaging_years:
            # NOTE(review): dividing by len(averaging_years) - 1 rather than len(averaging_years) looks
            # suspicious for an average — confirm this denominator is intended
            avg_vac_trans_mat = np.divide(avg_vac_trans_mat, float(len(averaging_years) - 1))
            avg_vac_prob_mat = freq_mat_to_prob_mat(avg_vac_trans_mat.tolist(), round_to=5)
            writer.writerow(["AVERAGED ACROSS YEARS"] + averaging_years)
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(avg_vac_prob_mat)):
                writer.writerow([header[1:][i]] + avg_vac_prob_mat[i])