Example #1
import collections
import csv
from os import path

import migration_helpers


def main(fhs_folder, csv_output, progress=_print_progress):
    """Extract the salaries information from FHS and bucketize them.

    In order to avoid issues about jobseekers being counted several times, we
    only consider salaries for jobseekers that are still unemployed at the time
    the FHS was sampled.

    Args:
        fhs_folder: path of the root folder of the FHS files.
        csv_output: path to the file to write to.
    """
    # TODO: Factor this code out and share it with fhs_job_frequency.

    # Check that the output file is writable before starting the long process
    # of collecting data.
    with open(csv_output, 'w'):
        pass

    de_rows = migration_helpers.flatten_iterator(
        path.join(fhs_folder, '*/de_*.csv'))

    # Rough estimate of the total number of rows in the FHS "de" table.
    total = 7000001

    job_seeker_counts = collections.defaultdict(int)
    counted = 0
    for de_dict in de_rows:
        counted += 1
        if counted % 10000 == 0 and progress:
            progress(counted, total)
        # Discard historical job requests; only keep the ones that are still
        # open.
        if de_dict[_END_DATE_FIELD]:
            continue
        job_seeker_counts[job_seeker_criteria(de_dict)] += 1

    if progress:
        progress(total, total)

    with open(csv_output, 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow((
            'code_rome',
            'departement_id',
            'salary_unit',
            'salary_low',
            'salary_high',
            'count'))
        for key, count in job_seeker_counts.items():
            writer.writerow(key + (count,))
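
This snippet relies on module-level helpers that the listing does not show: _END_DATE_FIELD (the name of the end-of-unemployment-period column in the "de" table), the _print_progress callback and job_seeker_criteria. As a rough idea of their shape only, here is a minimal sketch; the bodies and the field names in it are assumptions, not the project's actual code.

import sys


def _print_progress(counted, total):
    # ASSUMPTION: a plausible progress callback matching the calls above; the
    # real implementation is not shown in this listing.
    sys.stdout.write('\r{} rows processed out of ~{}'.format(counted, total))
    sys.stdout.flush()


def job_seeker_criteria(de_dict):
    # ASSUMPTION: builds the bucket key that gets written as a CSV row above,
    # in the order (code_rome, departement_id, salary_unit, salary_low,
    # salary_high). The de_dict field names below are hypothetical.
    return (
        de_dict['code_rome'],
        de_dict['departement_id'],
        de_dict['salary_unit'],
        de_dict['salary_low'],
        de_dict['salary_high'])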
Example #2
import collections
import json
from os import path

import migration_helpers
import numpy
import pandas


def main(fhs_folder, json_output, progress=_print_progress):
    """Extract the job OGR codes from the FHS and count them.

    Args:
        fhs_folder: path of the root folder of the FHS files.
        json_output: path to the file to write to.
    """
    # Check that the output file is writable before starting the long process
    # of collecting data.
    with open(json_output, 'w'):
        pass

    de_rows = migration_helpers.flatten_iterator(
        path.join(fhs_folder, '*/de_*.csv'))

    # Rough estimate of the total number of rows in the FHS "de" table.
    total = 7000001

    # If we need to do this often, we could replace this code with a simple
    # MapReduce to use multiple threads or multiple machines.
    job_counts = collections.defaultdict(int)
    counted = 0
    for de_dict in de_rows:
        counted += 1
        if counted % 1000 == 0 and progress:
            progress(counted, total)
        # Discard historical job requests; only keep the ones that are still
        # open.
        if de_dict[_END_DATE_FIELD]:
            continue
        job_code = de_dict[_JOB_CODE_FIELD]
        job_counts[job_code] += 1

    if progress:
        progress(total, total)

    job_count_series = pandas.Series(job_counts)
    # Add random Gaussian noise so that the numbers do not reveal the raw
    # data.
    job_count_series = job_count_series.add(
        numpy.random.normal(scale=5, size=len(job_count_series)))
    # Round to integers to hide the fact that we added noise.
    job_count_series = job_count_series.round().astype(int)
    # Keep strictly positive values only.
    job_count_series = job_count_series[job_count_series > 0]
    # Return to dict format and get rid of the numpy int64 values (TODO: find
    # a cleaner way to do this).
    job_counts = json.loads(job_count_series.to_json())

    with open(json_output, 'w') as output_file:
        json.dump(job_counts, output_file, sort_keys=True, indent=2)
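
The TODO above asks for a cleaner way to get rid of the numpy int64 values than a JSON round-trip. One possible alternative, sketched here, is to rebuild the dict while casting each count to a plain Python int:

# Possible alternative to the json.loads(to_json()) round-trip: cast each
# numpy int64 count to a plain Python int while rebuilding the dict.
job_counts = {
    job_code: int(count)
    for job_code, count in job_count_series.items()}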
Example #3
def _table_iterator(table):
    # Nested helper: fhs_folder comes from the enclosing function's scope.
    # Iterates over the rows of all CSV files for the given FHS table.
    return PeekIterator(
        migration_helpers.flatten_iterator(
            path.join(fhs_folder, '*/{}.csv'.format(table))))
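
PeekIterator and migration_helpers.flatten_iterator are project helpers that the listing does not define. Judging only from how they are used, flatten_iterator chains the rows of every CSV file matching a glob pattern, and PeekIterator wraps an iterator so that the next element can be inspected without consuming it. A minimal sketch under those assumptions, not the project's actual implementations:

import csv
import glob


def flatten_iterator(files_pattern):
    # ASSUMPTION: yields each row of every CSV file matching the glob pattern
    # as a dict (the callers above access rows by field name).
    for file_name in sorted(glob.glob(files_pattern)):
        with open(file_name) as csv_file:
            for row in csv.DictReader(csv_file):
                yield row


class PeekIterator(object):
    # ASSUMPTION: a minimal peekable iterator; the project's actual class may
    # expose a different interface.

    def __init__(self, iterator):
        self._iterator = iter(iterator)
        self._buffer = []

    def peek(self):
        # Return the next item without consuming it.
        if not self._buffer:
            self._buffer.append(next(self._iterator))
        return self._buffer[0]

    def __iter__(self):
        return self

    def __next__(self):
        if self._buffer:
            return self._buffer.pop(0)
        return next(self._iterator)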