def main(fhs_folder, now, mode_name, csv_output):
    """Extract the salaries information from FHS and bucketize them.

    Iterates over the job seekers found in the FHS folder and writes one CSV
    row per item yielded by job_seeker_rows, after a header row made of
    _CRITERIA_HEADERS.

    Args:
        fhs_folder: path of the root folder of the FHS files.
        now: the date at which the FHS data was extracted, as a string
            formatted YYYY-MM-DD, e.g. 2015-12-31.
        mode_name: the mode of extraction, see _MODES.
        csv_output: path to the file to write to.

    Raises:
        ValueError: if mode_name is not one of the keys of _MODES, or if now
            does not match the YYYY-MM-DD format.
    """
    if mode_name not in _MODES:
        raise ValueError('Unsupported mode: [%s], want one of [%s]' %
                         (mode_name, _MODES.keys()))
    mode = _MODES[mode_name]
    now = datetime.datetime.strptime(now, '%Y-%m-%d').date()

    job_seekers = fhs.job_seeker_iterator(
        fhs_folder, (fhs.UNEMPLOYMENT_PERIOD_TABLE, fhs.PART_TIME_WORK_TABLE))

    # Estimation of the total # of job seekers in the FHS. It is only used to
    # give the progress bar a total to measure against.
    total = 2522364

    # The csv module handles line endings itself: the file must be opened with
    # newline='' so that platforms translating '\n' do not emit '\r\r\n' and
    # create blank lines between rows.
    with open(csv_output, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(_CRITERIA_HEADERS)
        for job_seeker in tqdm.tqdm(job_seekers, total=total):
            for row in job_seeker_rows(job_seeker, now, mode.categories,
                                       mode.only_last):
                writer.writerow(list(row))
예제 #2
0
    def test_job_seeker_iterator(self, mock_flatten_iterator):
        """job_seeker_iterator gathers the rows of each table per job seeker.

        The patched flatten iterator serves canned rows for the 'de' and 'e0'
        tables; the test then checks the data held by each yielded job
        seeker, including one that has no 'e0' row at all.
        """
        registration_date = datetime.date(2015, 12, 1)

        def _fake_flatten(filename):
            # Canned content keyed on the table prefix found in the filename
            # pattern; the '*' wildcard is replaced by the region each row is
            # pretending to come from.
            if '/de_' in filename:
                de_rows = [
                    {
                        'IDX': idx,
                        'ROME': 'foo',
                        'DATINS': registration_date,
                        '__file__': filename.replace('*', region),
                    }
                    for idx, region in (
                        ('1', 'Reg01'),
                        ('15', 'Reg01'),
                        ('2', 'Reg21'),
                    )
                ]
                return iter(de_rows)
            if '/e0_' in filename:
                e0_rows = [
                    {
                        'IDX': idx,
                        'HOURS': hours,
                        'MOIS': '201510',
                        '__file__': filename.replace('*', region),
                    }
                    for idx, hours, region in (
                        ('1', 42, 'Reg01'),
                        ('1', 43, 'Reg01'),
                        ('2', 27, 'Reg21'),
                    )
                ]
                return iter(e0_rows)
            # Any other table: nothing to serve.
            return None

        mock_flatten_iterator.side_effect = _fake_flatten

        job_seekers = list(
            fhs.job_seeker_iterator('/folder/path/', tables=('de', 'e0')))
        data = [
            job_seeker._data  # pylint: disable=protected-access
            for job_seeker in job_seekers
        ]

        # Job seeker '1': one registration and two part-time work rows.
        first_seeker = {
            'IDX': '1',
            'de': [{
                'IDX': '1',
                'ROME': 'foo',
                'DATINS': datetime.date(2015, 12, 1),
                '__file__': '/folder/path/Reg01/de_Reg01.csv',
            }],
            'e0': [
                {
                    'IDX': '1',
                    'HOURS': 42,
                    'MOIS': '201510',
                    '__file__': '/folder/path/Reg01/e0_Reg01.csv',
                },
                {
                    'IDX': '1',
                    'HOURS': 43,
                    'MOIS': '201510',
                    '__file__': '/folder/path/Reg01/e0_Reg01.csv',
                },
            ],
        }
        # Job seeker '15': registered but without any part-time work row.
        second_seeker = {
            'IDX': '15',
            'de': [{
                'IDX': '15',
                'ROME': 'foo',
                'DATINS': datetime.date(2015, 12, 1),
                '__file__': '/folder/path/Reg01/de_Reg01.csv',
            }],
            'e0': [],
        }
        # Job seeker '2': rows coming from another region's files.
        third_seeker = {
            'IDX': '2',
            'de': [{
                'IDX': '2',
                'ROME': 'foo',
                'DATINS': datetime.date(2015, 12, 1),
                '__file__': '/folder/path/Reg21/de_Reg21.csv',
            }],
            'e0': [{
                'IDX': '2',
                'HOURS': 27,
                'MOIS': '201510',
                '__file__': '/folder/path/Reg21/e0_Reg21.csv',
            }],
        }

        self.assertEqual([first_seeker, second_seeker, third_seeker], data)