def test_importer_filter_ids(self, mongo_mock):
    """Test of the filter_ids flag."""

    def richer_importer_func():
        """An importer with many outputs."""
        return [
            {'_id': 'foo-{:02d}'.format(index), 'value': index}
            for index in range(20)]

    mongo_mock.return_value = mock.MagicMock()
    mongo.importer_main(
        richer_importer_func, 'my-collection',
        ['foo', '--filter_ids', 'foo-.2'],
        flag_values=gflags.FlagValues())

    importer = mongo_mock.return_value.import_in_collection
    self.assertTrue(importer.called)
    # Positional args of the recorded call: (values, collection_name).
    imported_values = importer.call_args[0][0]
    collection_name = importer.call_args[0][1]
    self.assertEqual(
        [{'_id': 'foo-02', 'value': 2}, {'_id': 'foo-12', 'value': 12}],
        imported_values)
    self.assertEqual('my-collection', collection_name)
def test_importer_main_no_args(self) -> None:
    """Test the importer_main without args."""
    # 'foo' is an unexpected positional argument, so argument parsing is
    # expected to abort the program with SystemExit.
    with self.assertRaises(SystemExit):
        mongo.importer_main(
            _my_importer_func, 'my-collection', ['foo'], out=self.output)
def test_fail_on_diff_when_no_diff(
        self, mock_requests: requests_mock.Mocker) -> None:
    """Test of the fail_on_diff flag when there are no diffs."""
    result = [{'dummy': 3, '_id': 'only-one'}]

    def import_func() -> list[dict[str, Any]]:
        """Foo."""
        return result

    mock_requests.post('https://slack.example.com/webhook')

    # First import populates the collection and notifies Slack once.
    mongo.importer_main(import_func, 'my-collection', [], out=self.output)
    self.assertEqual(1, mock_requests.call_count)

    # Re-importing identical data with --fail_on_diff must not raise,
    # and should send a second Slack notification saying so.
    mongo.importer_main(
        import_func, 'my-collection', ['--fail_on_diff'], out=self.output)
    self.assertEqual(2, mock_requests.call_count)
    self.assertIn(
        'The data is already up to date.',
        mock_requests.request_history[1].json()['attachments'][0]['text'])

    # The stored document is unchanged.
    self.assertEqual(
        1, self.db_client.test['my-collection'].count_documents({}))
    value = self.db_client.test['my-collection'].find_one()
    assert value
    del value['_id']
    self.assertEqual({'dummy': 3}, value)
def test_importer_main_no_args(self) -> None:
    """Test the importer_main without args."""
    # 'foo' is not a recognized flag, so the parser should raise
    # argparse.ArgumentError instead of importing anything.
    with self.assertRaises(argparse.ArgumentError):
        mongo.importer_main(
            _my_importer_func, 'my-collection', ['foo'], out=self.output)
def test_importer_main_no_args_but_default(self) -> None:
    """Test the importer_main without args but with default value."""

    def import_func(arg1: str = 'default value') -> list[dict[str, Any]]:
        """Foo."""
        return [{'dummy': 2, 'arg1': arg1}]

    mongo.importer_main(import_func, 'my-collection', [], out=self.output)

    # With no CLI flag given, the importer falls back on the default.
    stored = self.db_client.test['my-collection'].find_one()
    assert stored
    stored.pop('_id')
    self.assertEqual({'arg1': 'default value', 'dummy': 2}, stored)
def test_importer_main(self, mongo_mock):
    """Test of basic usage of the importer_main function."""
    mongo_mock.return_value = mock.MagicMock()
    mongo.importer_main(
        _my_importer_func, 'my-collection',
        ['foo', '--arg1', 'Value of arg1'],
        flag_values=gflags.FlagValues())
    import_in_collection = mongo_mock.return_value.import_in_collection
    self.assertTrue(import_in_collection.called)
    # call_args[0] holds the positional args of the recorded call:
    # (values, collection_name).
    call_args = import_in_collection.call_args[0]
    self.assertEqual([{'arg1': 'Value of arg1', 'dummy': 2}], call_args[0])
    self.assertEqual('my-collection', call_args[1])
def test_importer_main(self) -> None:
    """Test of basic usage of the importer_main function."""
    mongo.importer_main(
        _my_importer_func, 'my-collection',
        ['--arg1', 'Value of arg1'], out=self.output)

    # The importer maintains a 'meta' collection alongside the data.
    self.assertEqual(
        ['meta', 'my-collection'],
        sorted(self.db_client.test.list_collection_names()))
    self.assertEqual(
        1, self.db_client.test['my-collection'].count_documents({}))
    value = self.db_client.test['my-collection'].find_one()
    assert value
    del value['_id']
    self.assertEqual({'arg1': 'Value of arg1', 'dummy': 2}, value)
def test_importer_main_no_args_but_default(self, mongo_mock):
    """Test the importer_main without args but with default value."""

    def import_func(arg1='default value'):
        """Foo."""
        return [{'dummy': 2, 'arg1': arg1}]

    mongo_mock.return_value = mock.MagicMock()
    mongo.importer_main(
        import_func, 'my-collection', ['foo'],
        flag_values=gflags.FlagValues())
    import_in_collection = mongo_mock.return_value.import_in_collection
    self.assertTrue(import_in_collection.called)
    # The imported values should carry the default arg1 value.
    call_args = import_in_collection.call_args[0]
    self.assertEqual([{'arg1': 'default value', 'dummy': 2}], call_args[0])
def test_importer_main_with_input_file(self) -> None:
    """Test that the import_func doesn't get called with an input file."""

    def importer_func() -> list[dict[str, Any]]:  # pragma: no-cover
        """Foo."""
        self.fail('Should not be called')
        return []

    testdata_dir = path.join(path.dirname(__file__), 'testdata')
    json_path = path.join(testdata_dir, 'import_dummy_data.json')
    mongo.importer_main(
        importer_func, 'my_collection',
        ['--from_json', json_path], out=self.output)
    # The dummy JSON testdata file contains exactly one document.
    self.assertEqual(1, len(list(self.db_client.test.my_collection.find())))
def test_importer_collection_name(self) -> None:
    """Test the importer_main getting the collection name."""

    def import_func(collection_name: str) -> list[dict[str, Any]]:
        """Foo."""
        return [{'dummy': 2, 'collection_name': collection_name}]

    mongo.importer_main(
        import_func, 'my-collection',
        ['--mongo_collection', 'cli-name'], out=self.output)

    # The CLI flag overrides the default collection name and is handed
    # to the importer function as well.
    document = self.db_client.test['cli-name'].find_one()
    assert document
    document.pop('_id')
    self.assertEqual({'collection_name': 'cli-name', 'dummy': 2}, document)
def test_importer_main_with_output_file(self, mongo_mock):
    """Test that data gets written to file instead of DB when file given."""
    # tempfile.mktemp is deprecated and race-prone: create the file
    # safely instead and keep only its name.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        out_path = temp_file.name
    mongo.importer_main(
        _my_importer_func, 'my-collection',
        ['', '--to_json', out_path, '--arg1', 'arg1 test value'],
        flag_values=gflags.FlagValues())
    # With --to_json, nothing must reach the database.
    import_in_collection = mongo_mock.return_value.import_in_collection
    self.assertFalse(import_in_collection.called)
    # Read back with an explicit encoding: JSON files are UTF-8.
    with open(out_path, encoding='utf-8') as json_file:
        json_content = json_file.read()
    self.assertEqual([{
        'arg1': 'arg1 test value',
        'dummy': 2
    }], json.loads(json_content))
    # The output file should be newline-terminated.
    self.assertTrue(json_content.endswith('\n'))
def test_importer_filter_ids(self) -> None:
    """Test of the filter_ids flag."""

    def richer_importer_func() -> list[dict[str, Any]]:
        """An importer with many outputs."""
        return [{'_id': f'foo-{i:02d}', 'value': i} for i in range(20)]

    mongo.importer_main(
        richer_importer_func, 'my-collection',
        ['--filter_ids', 'foo-.2'], out=self.output)

    # Only the documents whose _id matches the 'foo-.2' pattern survive.
    imported = list(self.db_client.test['my-collection'].find())
    self.assertEqual(
        [{'_id': 'foo-02', 'value': 2}, {'_id': 'foo-12', 'value': 12}],
        imported)
def test_importer_main_with_input_file(self, pymongo_mock):
    """Test that the import_func doesn't get called with an input file."""
    mock_importer_func = mock.MagicMock(spec=_my_importer_func)

    def importer_func():
        """Foo."""
        mock_importer_func()

    # Route pymongo to an in-memory mongomock instance.
    client = mongomock.MongoClient('mongodb://mongo-url/test')
    pymongo_mock.MongoClient.return_value = client
    testdata_dir = path.join(path.dirname(__file__), 'testdata')
    json_path = path.join(testdata_dir, 'import_dummy_data.json')
    mongo.importer_main(
        importer_func, 'my_collection', ['', '--from_json', json_path],
        flag_values=gflags.FlagValues())
    # With --from_json, the importer function must be bypassed entirely.
    self.assertFalse(mock_importer_func.called)
    self.assertEqual(1, len(list(client.test.my_collection.find())))
def test_importer_main_with_output_file(self) -> None:
    """Test that data gets written to file instead of DB when file given."""
    # tempfile.mktemp is deprecated and race-prone: create the file
    # safely instead and keep only its name.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        out_path = temp_file.name
    mongo.importer_main(
        _my_importer_func, 'my-collection',
        ['--to_json', out_path, '--arg1', 'arg1 test value'],
        out=self.output)
    with open(out_path, encoding='utf-8') as json_file:
        json_content = json_file.read()
    self.assertEqual([{
        'arg1': 'arg1 test value',
        'dummy': 2
    }], json.loads(json_content))
    # The output file should be newline-terminated...
    self.assertTrue(json_content.endswith('\n'))
    # ...and nothing should have reached the database.
    self.assertEqual(
        0, len(list(self.db_client.test['my-collection'].find())))
def test_fail_on_diff(self, mock_requests: requests_mock.Mocker) -> None:
    """Test of the fail_on_diff flag."""
    result = [{'dummy': 3, '_id': 'only-one'}]

    def import_func() -> list[dict[str, Any]]:
        """Foo."""
        return result

    mock_requests.post('https://slack.example.com/webhook')
    # First import populates the collection and notifies Slack once.
    mongo.importer_main(import_func, 'my-collection', [], out=self.output)
    self.assertEqual(1, mock_requests.call_count)
    mock_requests.reset_mock()  # type: ignore

    # Change the data: with --fail_on_diff the import must abort.
    result[0]['dummy'] = 4
    with self.assertRaises(ValueError):
        mongo.importer_main(
            import_func, 'my-collection', ['--fail_on_diff'],
            out=self.output)
    # A single Slack notification describing the diff was sent.
    self.assertEqual(1, mock_requests.call_count)
    self.assertIn(
        'There are some diffs to import.',
        mock_requests.request_history[0].json()['attachments'][0]['text'])

    # The stored document still holds the original value.
    self.assertEqual(
        1, self.db_client.test['my-collection'].count_documents({}))
    value = self.db_client.test['my-collection'].find_one()
    assert value
    del value['_id']
    self.assertEqual(
        {'dummy': 3}, value, msg='Values should not have been updated')
def validate(values, proto_class):
    """Validate that the values have the right format.

    Args:
        values: an iterable of dict with the JSON values of proto. They may
            have an additional "_id" field that will be ignored.
        proto_class: the Python class of the proto that should be contained in
            the values.
    Returns:
        the input for chainability
    Raises:
        ValueError if one of the values doesn't have the right format.
    """
    for value in values:
        proto = proto_class()
        # Temporarily remove the Mongo-specific "_id" field: it is not part
        # of the proto schema and would fail the parse below.
        _id = value.pop('_id', None)
        # Enforce Proto schema.
        try:
            json_format.Parse(json.dumps(value), proto)
        except json_format.ParseError as error:
            # Chain the original parse error so the traceback keeps it.
            raise ValueError('Error while parsing:\n{}\n{}'.format(
                json.dumps(value, indent=2), error)) from error
        if _id is not None:
            value['_id'] = _id
    return values


if __name__ == '__main__':
    mongo.importer_main(airtable2dicts, 'test')  # pragma: no-cover
# NOTE(review): this excerpt starts mid-function - the ']]' below closes a
# column selection (presumably `samples = jobs[[...]]`) begun above this chunk.
    ]]
    # Rename CSV columns to the camelCase names used in the JSON output.
    samples.rename(columns={
        'target_job': 'codeOgr',
        'target_job_name': 'name',
        'target_job_masculine_name': 'masculineName',
        'target_job_feminine_name': 'feminineName',
    }, inplace=True)
    return {
        'jobGroup': {
            'romeId': jobs.target_job_group.iloc[0],
            'name': jobs.target_job_group_name.iloc[0],
            'samples': samples.to_dict('records'),
        }
    }


def _sample_jobs(num_samples):
    """Return a function sampling up to num_samples jobs from a DataFrame."""

    def _sampling(jobs):
        # Only sample when there are more rows than requested.
        if len(jobs.index) > num_samples:
            jobs = jobs.sample(n=num_samples)
        jobs = jobs[['codeOgr', 'name', 'masculineName', 'feminineName']]
        return jobs.to_dict('records')

    return _sampling


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'similar_jobs')  # pragma: no cover
# NOTE(review): this excerpt starts mid-function; the enclosing importer's
# definition (and `country_wide_missions`/`missions`) is above this chunk.
    # Missions restricted to one departement: pick up to 5 random ones each.
    departement_missions = missions[~missions.isAvailableEverywhere]\
        .groupby('departement').apply(_get_random_missions_picker(5))
    # NOTE(review): Series.iteritems() was removed in pandas 2.0 - use
    # .items() when upgrading.
    returned_missions = country_wide_missions + [{
        '_id': departement_id,
        'missions': missions
    } for departement_id, missions in departement_missions.iteritems()]
    if not check_coverage(returned_missions):
        raise ValueError('The putative new data lacks coverage.')
    return returned_missions


def _get_random_missions_picker(num_missions: int) \
        -> Callable[[pd.DataFrame], List[Dict[str, Any]]]:
    """Return a function picking up to num_missions random missions."""

    def _pick_random_missions(missions: pd.DataFrame) -> List[Dict[str, Any]]:
        # Only sample when there are more rows than requested.
        if len(missions) > num_missions:
            samples = missions.sample(num_missions)
        else:
            samples = missions
        return typing.cast(
            List[Dict[str, Any]],
            samples[['associationName', 'title', 'link', 'description']].to_dict('records'))

    return _pick_random_missions


if __name__ == '__main__':
    mongo.importer_main(get_missions_dicts, 'volunteering_missions')
# NOTE(review): this excerpt starts mid-function; the enclosing importer's
# definition (and `events_file_name`, `rules`, the TSV paths) is above.
    # NOTE(review): open() without an explicit encoding depends on the
    # locale - consider encoding='utf-8' for JSON data.
    with open(events_file_name) as json_data:
        salons = typing.cast(List[Dict[str, Any]], json.load(json_data))
    for salon in salons:
        # Convert the French date fields to ISO dates.
        salon['start_date'] = _isodate_from_string(salon['dateDebut'])
        salon['application_start_date'] = _isodate_from_string(
            salon['dateDebutCandidature'])
        salon['application_end_date'] = _isodate_from_string(
            salon['dateFinCandidature'], is_end_of_day=True)
        salon['locations'] = _get_city(
            french_regions_tsv, prefix_tsv,
            typing.cast(str, salon.get('localisation', '')))
        salon = _aggregate_rule_results(salon, rules)
        if not salon['locations']:
            logging.warning('Missing locations on salon\n%s', salon)
        # TODO(cyrille): Add test for not missing case.
        if not salon.get('jobGroupIds'):
            logging.warning('Missing job groups on salon\n%s', salon)
        # Rename legacy field names, skipping salons that lack them.
        for old, new in _FIELD_RENAMER.items():
            try:
                salon[new] = salon.pop(old)
            except KeyError:
                continue
        for field in _FIELDS_TO_DROP:
            salon.pop(field, None)
    return salons


if __name__ == '__main__':
    mongo.importer_main(json2dicts, 'online_salons')
# NOTE(review): this excerpt starts inside the docstring of the enclosing
# importer function, whose def line is above this chunk.
        stats_filename: path to a file containing stats about cities.
        urban_context_filename: path to a file containing urban context info
            for each cities.
    Returns:
        A list of dict JSON-like object compatible with the
        geo_pb2.FrenchCity proto.
    """
    city_stats = pandas.read_csv(
        stats_filename,
        sep=',', header=None, usecols=[10, 19, 20],
        names=['_id', 'longitude', 'latitude'],
        dtype={'_id': str, 'latitude': float, 'longitude': float})
    # NOTE(review): dropna() is not in-place and its result is discarded, so
    # this line has no effect. Probably meant city_stats.dropna(inplace=True)
    # or city_stats = city_stats.dropna() - confirm intent.
    city_stats.dropna()
    urban_contexts = cleaned_data.french_urban_areas(
        filename=urban_context_filename)
    # Unknown cities default to UNKNOWN_URBAN_CONTEXT.
    city_stats['urbanContext'] = city_stats['_id'].map(urban_contexts.periurban)\
        .fillna(geo_pb2.UNKNOWN_URBAN_CONTEXT).astype(int)
    return typing.cast(List[Dict[str, Any]], city_stats.to_dict(orient='records'))


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'cities')
# NOTE(review): this excerpt starts mid-function; `by_region`, `region_count`,
# `recent_offers` and `res` are defined above this chunk.
    by_region['region_count'] = region_count
    city_count = by_region.reset_index()

    # Compute country counts for each city.
    country_count = recent_offers.groupby('rome_id').id_offre.count()
    by_country = city_count.set_index('rome_id')
    by_country['country_count'] = country_count
    city_count = by_country.reset_index()

    # One document per (job group, city) pair, with counts at every
    # geographic granularity.
    for row in city_count.itertuples():
        res.append({
            '_id': row.rome_id + ':c' + row.city_code,
            'city': {
                'cityId': row.city_code,
                'name': row.city_name,
                'departementId': row.departement_code,
                'departementName': row.departement_name,
                'regionId': row.region_code,
                'regionName': row.region_name,
            },
            'cityCount': int(row.city_count),
            'departementCount': int(row.departement_count),
            'regionCount': int(row.region_count),
            'countryCount': int(row.country_count),
        })
    return res


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'job_offers')  # pragma: no cover
# NOTE(review): this excerpt starts mid-expression - the tail of a dict
# comprehension (keyed by each requirement's '_id') begun above this chunk.
        job_requirement.pop('_id'): job_requirement
        for job_requirement in job_requirements_list
    }
    job_groups['requirements'] = job_groups.index.map(job_requirements_dict)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # SkillsForFuture
    skills_for_future_by_rome = airtable_to_protos.load_items_from_prefix(
        'Skill', job_groups.index, skills_for_future_airtable,
        'soc_prefixes_us')
    if skills_for_future_by_rome:
        # Merge English translations into each skill record.
        with translation.Translator() as translator:
            translated_skills_for_future_by_rome = {
                rome_id: [
                    skill | translator.ensure_translate_fields(
                        skill, locale='en', fields=_SKILL_18N_FIELDS)
                    for skill in skills
                ]
                for rome_id, skills in skills_for_future_by_rome.items()
            }
        job_groups['skillsForFuture'] = job_groups.index.map(
            translated_skills_for_future_by_rome)
    return typing.cast(list[dict[str, Any]], job_groups.to_dict('records'))


if __name__ == '__main__':
    mongo.importer_main(make_dicts, 'job_group_info')
# Airtable API key, read once at import time from the environment.
API_KEY = os.getenv('AIRTABLE_API_KEY')


def airtable2dicts(
        base_id: str, table: str,
        view: Optional[str] = None) -> List[Dict[str, Any]]:
    """Import the users email from Airtable.

    Args:
        base_id: the ID of your Airtable app.
        table: the name of the table to import.
        view: optional - the name of the view to import.
    Returns:
        an iterable of dict with the JSON values of the proto.
    """
    if not API_KEY:
        raise ValueError(
            'No API key found. Create an airtable API key at '
            'https://airtable.com/account and set it in the AIRTABLE_API_KEY '
            'env var.')
    client = airtable.Airtable(base_id, API_KEY)
    emails: List[Dict[str, Any]] = []
    for record in client.iterate(table, view=view):
        fields = record.get('fields', {})
        emails.append({'_id': fields.get('email', '')})
    return emails


if __name__ == '__main__':
    mongo.importer_main(airtable2dicts, 'show_unverified_data_users')
# NOTE(review): this excerpt starts mid-statement - the closing of a
# `country_wide_missions = [{...}, ]` literal inside an if branch begun above.
            },
        ]
    else:
        country_wide_missions = []

    # TODO(pascal): Add some missions per city as well.

    # Missions restricted to one departement: pick up to 5 random ones each.
    # NOTE(review): Series.iteritems() was removed in pandas 2.0 - use
    # .items() when upgrading.
    departement_missions = missions[~missions.isAvailableEverywhere]\
        .groupby('departement').apply(_get_random_missions_picker(5))
    return country_wide_missions + [{
        '_id': departement_id,
        'missions': missions
    } for departement_id, missions in departement_missions.iteritems()]


def _get_random_missions_picker(num_missions):
    """Return a function picking up to num_missions random missions."""

    def _pick_random_missions(missions):
        # Only sample when there are more rows than requested.
        if len(missions) > num_missions:
            samples = missions.sample(num_missions)
        else:
            samples = missions
        return samples[
            ['associationName', 'title', 'link', 'description']].to_dict('records')

    return _pick_random_missions


if __name__ == '__main__':
    mongo.importer_main(get_missions_dicts, 'volunteering_missions')  # pragma: no-cover
# NOTE(review): this excerpt starts with the tail of a date-formatting helper
# (builds an ISO YYYY-MM-DD string) whose def line is above this chunk.
    return f'{year:04d}-{month:02d}-{day:02d}'


def _adie_event_to_proto(props: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one raw Adie event (French field names) to an output dict."""
    props['cityName'] = props['ville'].title()
    return {
        '_id': props['rdvGroupeId'],
        'cityName': props['cityName'],
        # Markdown description assembled from the raw French fields.
        'description':
            '***Ça parle de quoi ?***\n\n'
            '{sousTitre}\n\n'
            '***Ça se passe où ?***\n\n'
            '{nomSite}\n'
            '{adresse1}, {adresse2}, {codePostal} {cityName}\n\n'
            '***Quand ?***\n\n'
            'le {date}\n'.format(**props),
        'latitude': props['latitude'],
        'longitude': props['longitude'],
        # Keep only the day and month words of the date, e.g. 'le 12 mars'.
        'timingText': f'le {" ".join(props["date"].split(" ")[1:3])}',
        'startDate': _parse_date(_drop_first_word(props['date'])),
        'title': props['titre'],
    }


def _drop_first_word(text: str) -> str:
    """Drop the first space-separated word of text."""
    return ' '.join(text.split(' ')[1:])


if __name__ == '__main__':
    mongo.importer_main(adie_events2dicts, 'adie_events')
# NOTE(review): this excerpt starts mid-call - the tail of a salary-estimation
# helper (locale.format_string call on minSalary, presumably) begun above.
        grouping=True)
    to_salary = locale.format_string('%d', estimation['maxSalary'], grouping=True)
    estimation['shortText'] = f'{from_salary} - {to_salary}'
    estimation['unit'] = 'ANNUAL_GROSS_SALARY'
    return estimation


# NOTE(review): the annotated return type is pandas.Series, but the final
# .reset_index() turns the Series into a DataFrame - confirm which is intended.
def _get_training_count(trainings_csv: str) -> pandas.Series:
    """Count trainings per local ID (departement:ROME prefix)."""
    trainings = pandas.read_csv(trainings_csv, dtype={'address.postalCode': str})
    # Fix short postal codes.
    short_postal_codes = trainings['address.postalCode'].str.len() == 4
    trainings.loc[short_postal_codes, 'address.postalCode'] = \
        '0' + trainings.loc[short_postal_codes, 'address.postalCode']
    # Extract deparatement ID
    trainings['departement_id'] = trainings['address.postalCode'].str[:2]
    # Overseas departements use 3-digit codes starting with 97.
    oversee_departement = trainings.departement_id == '97'
    trainings.loc[oversee_departement, 'departement_id'] = \
        trainings.loc[oversee_departement, 'address.postalCode'].str[:3]
    # Create local_id.
    trainings['local_id'] = trainings['departement_id'] + ':' +\
        trainings['formation.proximiteRomes.code']
    return trainings.dropna(subset=['local_id']).groupby('local_id').size()\
        .rename('trainingCount').reset_index()


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'local_diagnosis')
# NOTE(review): this excerpt starts mid-function; `fap_growth` and
# `rome_fap_crosswalk_txt` are defined above this chunk.
    # Relative job growth per FAP group over 2012-2022.
    fap_growth['growth_2012_2022'] = \
        fap_growth.num_job_creations_2012_2022.div(fap_growth.num_jobs_2012)
    rome_fap_mapping = cleaned_data.rome_fap_mapping(
        filename=rome_fap_crosswalk_txt)
    # Flatten the per-ROME set of FAP codes into one row per (ROME, FAP) pair.
    rome_fap_flat_mapping = pandas.melt(
        rome_fap_mapping.fap_codes.apply(
            lambda s: pandas.Series(list(s))).reset_index(),
        id_vars=['index']).set_index('index').value.dropna().to_frame('fap_qualified_code')
    rome_fap_flat_mapping['fap_code'] = rome_fap_flat_mapping.fap_qualified_code.str[:3]
    # Growth stats are only published for these merged FAP groups, so map the
    # individual codes to the merged key.
    multi_fap_groups = {
        'D0Z-D3Z': {'D0Z', 'D3Z'},
        'F0Z-F1Z': {'F0Z', 'F1Z'},
        'F2Z-F3Z': {'F2Z', 'F3Z'},
    }
    for fap_codes, fap_codes_as_set in multi_fap_groups.items():
        rome_fap_flat_mapping.loc[
            rome_fap_flat_mapping.fap_code.isin(fap_codes_as_set),
            'fap_code'] = fap_codes
    # Drop K0Z rows entirely.
    rome_fap_flat_mapping.drop(
        rome_fap_flat_mapping[rome_fap_flat_mapping.fap_code == 'K0Z'].index,
        inplace=True)
    rome_fap_flat_mapping['growth_2012_2022'] = \
        rome_fap_flat_mapping.fap_code.map(fap_growth.set_index('fap_codes').growth_2012_2022)
    rome_fap_flat_mapping['num_jobs_2012'] = \
        rome_fap_flat_mapping.fap_code.map(fap_growth.set_index('fap_codes').num_jobs_2012)
    # Per ROME group: average growth of its FAPs weighted by 2012 headcount;
    # 0 when the group had no jobs in 2012 (avoids division by zero).
    return rome_fap_flat_mapping.groupby(level=0).apply(
        lambda faps: 0 if faps.num_jobs_2012.sum() == 0 else
        faps.growth_2012_2022.mul(faps.num_jobs_2012).sum() / faps.num_jobs_2012.sum())


if __name__ == '__main__':
    mongo.importer_main(make_dicts, 'job_group_info')  # pragma: no cover
# NOTE(review): this excerpt starts mid-call - the tail of a
# samples.rename(columns={...}) call begun above this chunk.
        'target_job': 'codeOgr',
        'target_job_name': 'name',
        'target_job_masculine_name': 'masculineName',
        'target_job_feminine_name': 'feminineName',
    }, inplace=True)
    return {
        'jobGroup': {
            'romeId': jobs.target_job_group.iloc[0],
            'name': jobs.target_job_group_name.iloc[0],
            'samples': samples.to_dict('records'),
        }
    }


def _sample_jobs(
        num_samples: int
) -> Callable[[pandas.DataFrame], list[dict[str, Any]]]:
    """Return a function sampling up to num_samples jobs from a DataFrame."""

    def _sampling(jobs: pandas.DataFrame) -> list[dict[str, Any]]:
        # Only sample when there are more rows than requested.
        if len(jobs.index) > num_samples:
            jobs = jobs.sample(n=num_samples)
        jobs = jobs[['codeOgr', 'name', 'masculineName', 'feminineName']]
        return typing.cast(list[dict[str, Any]], jobs.to_dict('records'))

    return _sampling


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'similar_jobs')
# NOTE(review): this excerpt starts mid-function; `by_region`, `region_count`,
# `recent_offers` and `res` are defined above this chunk.
    by_region['region_count'] = region_count
    city_count = by_region.reset_index()

    # Compute country counts for each city.
    country_count = recent_offers.groupby('rome_id').id_offre.count()
    by_country = city_count.set_index('rome_id')
    by_country['country_count'] = country_count
    city_count = by_country.reset_index()

    # One document per (job group, city) pair, with counts at every
    # geographic granularity.
    for row in city_count.itertuples():
        res.append({
            '_id': row.rome_id + ':c' + row.city_code,
            'city': {
                'cityId': row.city_code,
                'name': row.city_name,
                'departementId': row.departement_code,
                'departementName': row.departement_name,
                'regionId': row.region_code,
                'regionName': row.region_name,
            },
            'cityCount': int(row.city_count),
            'departementCount': int(row.departement_count),
            'regionCount': int(row.region_count),
            'countryCount': int(row.country_count),
        })
    return res


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'job_offers')
# NOTE(review): this excerpt starts inside the docstring of the enclosing
# importer (fhs2dicts, per the main guard below); its def line is above.
    Args:
        durations_csv: path to a CSV file containing one line for each job
            seeker, some of their properties and the duration of their last
            unemployment period. See the full doc in the
            `fhs_category_duration.py` script.
    Returns:
        A list of dict compatible with the JSON version of
        TODO: Add proto here
        with an additional unique "_id" field.
    """
    job_seekers = pandas.read_csv(durations_csv, dtype={'city_id': str})

    # One aggregated diagnosis per ROME job group.
    global_diagnoses = []
    for rome_id, group in job_seekers.groupby('code_rome'):
        estimation = importer_helpers.unemployment_estimation(group.duration)
        global_diagnoses.append({
            '_id': rome_id,
            'unemploymentTimeHistogram': _get_histogram(group.duration),
            'diagnosis': estimation,
        })
    return global_diagnoses


if __name__ == '__main__':
    mongo.importer_main(fhs2dicts, 'global_diagnosis')