def main(fhs_folder, now, mode_name, csv_output):
    """Extract the salaries information from FHS and bucketize them.

    Iterates over the job seekers found in the FHS folder and writes to a CSV
    file a header row (_CRITERIA_HEADERS) followed by every row that
    job_seeker_rows yields for each job seeker.

    Args:
        fhs_folder: path of the root folder of the FHS files.
        now: the date at which the FHS data was extracted, as a YYYY-MM-DD
            string, e.g. 2015-12-31.
        mode_name: the mode of extraction, see _MODES.
        csv_output: path to the file to write to.

    Raises:
        ValueError: if mode_name is not a key of _MODES, or if now cannot be
            parsed as a YYYY-MM-DD date.
    """
    # Validate the mode before touching any file so that a typo fails fast.
    if mode_name not in _MODES:
        raise ValueError(
            'Unsupported mode: [%s], want one of [%s]' %
            (mode_name, _MODES.keys()))
    mode = _MODES[mode_name]

    now = datetime.datetime.strptime(now, '%Y-%m-%d').date()

    job_seekers = fhs.job_seeker_iterator(
        fhs_folder,
        (fhs.UNEMPLOYMENT_PERIOD_TABLE, fhs.PART_TIME_WORK_TABLE))

    # Estimation of the total # of job seekers in the FHS. It is only handed
    # to tqdm as the expected length of the iteration.
    total = 2522364

    # The csv module does its own line-ending handling: the file must be
    # opened with newline='' or rows may end up separated by blank lines on
    # platforms that translate '\n' when writing.
    with open(csv_output, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(_CRITERIA_HEADERS)
        for job_seeker in tqdm.tqdm(job_seekers, total=total):
            rows = job_seeker_rows(
                job_seeker, now, mode.categories, mode.only_last)
            for row in rows:
                writer.writerow(list(row))
def test_job_seeker_iterator(self, mock_flatten_iterator):
    """Rows of the 'de' and 'e0' tables get grouped per job seeker.

    mock_flatten_iterator is a mock supplied by the caller (presumably a
    patch decorator outside this view); it is fed a fake implementation that
    serves canned rows depending on the requested file name.
    """

    registration_date = datetime.date(2015, 12, 1)

    def _de_row(idx, source):
        # One fake row of the 'de' table.
        return {
            'IDX': idx,
            'ROME': 'foo',
            'DATINS': registration_date,
            '__file__': source,
        }

    def _e0_row(idx, hours, source):
        # One fake row of the 'e0' table.
        return {
            'IDX': idx,
            'HOURS': hours,
            'MOIS': '201510',
            '__file__': source,
        }

    def _fake_flatten(filename):
        # The file name carries a '*' wildcard; each fake row records the
        # name with the wildcard replaced by its region.
        reg01 = filename.replace('*', 'Reg01')
        reg21 = filename.replace('*', 'Reg21')
        if '/de_' in filename:
            rows = [
                _de_row('1', reg01),
                _de_row('15', reg01),
                _de_row('2', reg21),
            ]
        elif '/e0_' in filename:
            rows = [
                _e0_row('1', 42, reg01),
                _e0_row('1', 43, reg01),
                _e0_row('2', 27, reg21),
            ]
        else:
            # Any other file gets no iterator at all.
            return None
        return iter(rows)

    mock_flatten_iterator.side_effect = _fake_flatten

    data = [
        seeker._data  # pylint: disable=protected-access
        for seeker in fhs.job_seeker_iterator(
            '/folder/path/', tables=('de', 'e0'))
    ]

    expected = [
        {
            'IDX': '1',
            'de': [_de_row('1', '/folder/path/Reg01/de_Reg01.csv')],
            'e0': [
                _e0_row('1', 42, '/folder/path/Reg01/e0_Reg01.csv'),
                _e0_row('1', 43, '/folder/path/Reg01/e0_Reg01.csv'),
            ],
        },
        {
            'IDX': '15',
            'de': [_de_row('15', '/folder/path/Reg01/de_Reg01.csv')],
            'e0': [],
        },
        {
            'IDX': '2',
            'de': [_de_row('2', '/folder/path/Reg21/de_Reg21.csv')],
            'e0': [_e0_row('2', 27, '/folder/path/Reg21/e0_Reg21.csv')],
        },
    ]
    self.assertEqual(expected, data)