def test_importer_main(self, mongo_mock):
    """Check that importer_main sends the importer's output to the collection."""
    fake_db = mock.MagicMock()
    mongo_mock.return_value = fake_db
    argv = ['foo', '--arg1', 'Value of arg1']

    mongo.importer_main(
        _my_importer_func, 'my-collection', argv,
        flag_values=gflags.FlagValues())

    importer = fake_db.import_in_collection
    self.assertTrue(importer.called)
    # Positional arguments of the last call: (records, collection name, ...).
    positional = importer.call_args[0]
    self.assertEqual([{'arg1': 'Value of arg1', 'dummy': 2}], positional[0])
    self.assertEqual('my-collection', positional[1])
def test_importer_main_no_args_but_default(self, mongo_mock):
    """Check that the importer's default value is used when no flag is given."""

    def import_func(arg1='default value'):
        """Foo."""
        return [{'dummy': 2, 'arg1': arg1}]

    fake_db = mock.MagicMock()
    mongo_mock.return_value = fake_db

    mongo.importer_main(
        import_func, 'my-collection', ['foo'],
        flag_values=gflags.FlagValues())

    importer = fake_db.import_in_collection
    self.assertTrue(importer.called)
    positional = importer.call_args[0]
    self.assertEqual([{'arg1': 'default value', 'dummy': 2}], positional[0])
def test_importer_main_with_output_file(self, mongo_mock):
    """Test that data gets written to file instead of DB when file given."""
    import os

    # A private temporary directory avoids the race condition inherent in
    # tempfile.mktemp() and is deleted when the block exits, so the test
    # does not leave files behind.
    with tempfile.TemporaryDirectory() as out_dir:
        out_path = os.path.join(out_dir, 'output.json')
        mongo.importer_main(
            _my_importer_func, 'my-collection',
            ['', '--to_json', out_path, '--arg1', 'arg1 test value'],
            flag_values=gflags.FlagValues())
        with open(out_path) as json_file:
            json_content = json_file.read()

    # Nothing must have been sent to the database.
    import_in_collection = mongo_mock.return_value.import_in_collection
    self.assertFalse(import_in_collection.called)

    self.assertEqual(
        [{'arg1': 'arg1 test value', 'dummy': 2}],
        json.loads(json_content))
    self.assertTrue(json_content.endswith('\n'))
def test_importer_main_with_input_file(self, pymongo_mock):
    """Check that a JSON input file is imported without calling the importer."""
    mock_importer_func = mock.MagicMock(spec=_my_importer_func)

    def importer_func():
        """Foo."""
        mock_importer_func()

    fake_client = mongomock.MongoClient('mongodb://mongo-url/test')
    pymongo_mock.MongoClient.return_value = fake_client
    json_path = path.join(
        path.dirname(__file__), 'testdata', 'import_dummy_data.json')

    mongo.importer_main(
        importer_func, 'my_collection', ['', '--from_json', json_path],
        flag_values=gflags.FlagValues())

    self.assertFalse(mock_importer_func.called)
    imported_docs = list(fake_client.test.my_collection.find())
    self.assertEqual(1, len(imported_docs))
def test_importer_filter_ids(self, mongo_mock):
    """Check that --filter_ids only keeps records whose ID matches the pattern."""

    def richer_importer_func():
        """An importer with many outputs."""
        return [{'_id': 'foo-%02d' % i, 'value': i} for i in range(20)]

    fake_db = mock.MagicMock()
    mongo_mock.return_value = fake_db

    mongo.importer_main(
        richer_importer_func, 'my-collection',
        ['foo', '--filter_ids', 'foo-.2'],
        flag_values=gflags.FlagValues())

    importer = fake_db.import_in_collection
    self.assertTrue(importer.called)
    positional = importer.call_args[0]
    expected_records = [
        {'_id': 'foo-02', 'value': 2},
        {'_id': 'foo-12', 'value': 12},
    ]
    self.assertEqual(expected_records, positional[0])
    self.assertEqual('my-collection', positional[1])
# Filter Job Groups on having more than 10 offers per departement month. top_departements_per_month = \ top_departements_per_month[top_departements_per_month.offers > 10] # Adding the job groups inside. def _create_jobgroups(jobs): return jobs[['name', 'romeId', 'offers']].to_dict(orient='records') romes_per_dep_month = top_departements_per_month.groupby( ['departementId', 'creationMonth', 'departementSeasonalOffers'])\ .apply(_create_jobgroups)\ .to_frame('jobGroups')\ .reset_index()\ .rename(columns={'creationMonth': '_id'}) def _create_month_stats(jobs): return jobs[['departementId', 'jobGroups', 'departementSeasonalOffers']]\ .to_dict(orient='records') monthly_data = romes_per_dep_month\ .groupby('_id')\ .apply(_create_month_stats)\ .to_frame('departementStats')\ .reset_index() return monthly_data.to_dict(orient='records') if __name__ == '__main__': mongo.importer_main(csv2dicts, 'seasonal_jobbing') # pragma: no cover
missions['isAvailableEverywhere'] = missions.JobId.map(all_post_codes) == _EVERYWHERE_POSTCODES if sum(missions.isAvailableEverywhere): everywhere_missions = missions[missions.isAvailableEverywhere].drop_duplicates('JobId') country_wide_missions = [ {'_id': '', 'missions': _get_random_missions_picker(5)(everywhere_missions)}, ] else: country_wide_missions = [] # TODO(pascal): Add some missions per city as well. departement_missions = missions[~missions.isAvailableEverywhere]\ .groupby('departement').apply(_get_random_missions_picker(5)) return country_wide_missions + [ {'_id': departement_id, 'missions': missions} for departement_id, missions in departement_missions.iteritems()] def _get_random_missions_picker(num_missions): def _pick_random_missions(missions): if len(missions) > num_missions: samples = missions.sample(num_missions) else: samples = missions return samples[['associationName', 'title', 'link', 'description']].to_dict('records') return _pick_random_missions if __name__ == '__main__': mongo.importer_main(get_missions_dicts, 'volunteering_missions') # pragma: no-cover
def download_and_count():
    """Count the job offers available per job group and département.

    Returns:
        Recent job offers count as a LocalJobStats JSON-proto compatible dict:
        one dict per "<DEPARTEMENT_CODE>:<ROME_PROFESSION_CARD_CODE>" ID with
        the number of offers seen for it.
    """
    offers_per_local_id = collections.Counter(
        '%s:%s' % (offer['DEPARTEMENT_CODE'], offer['ROME_PROFESSION_CARD_CODE'])
        for offer in _iterate_job_offers())
    return [
        {'_id': local_id, 'numAvailableJobOffers': num_offers}
        for local_id, num_offers in offers_per_local_id.items()]


def _iterate_job_offers():
    """Fetch the job offers records from the "offres" package of Emploi Store.

    The client credentials are read from the EMPLOI_STORE_CLIENT_ID and
    EMPLOI_STORE_CLIENT_SECRET environment variables.
    """
    store_client = emploi_store.Client(
        client_id=os.getenv('EMPLOI_STORE_CLIENT_ID'),
        client_secret=os.getenv('EMPLOI_STORE_CLIENT_SECRET'))
    offers_resource = store_client.get_package('offres').get_resource(
        name="Offres d'emploi")
    # Only request the two fields needed to build the local IDs.
    return offers_resource.records(
        fields=['DEPARTEMENT_CODE', 'ROME_PROFESSION_CARD_CODE'])


if __name__ == '__main__':
    mongo.importer_main(download_and_count, 'recent_job_offers')  # pragma: no cover
def validate(values, proto_class):
    """Validate that the values have the right format.

    Args:
        values: an iterable of dict with the JSON values of proto. They may
            have an additional "_id" field that will be ignored.
        proto_class: the Python class of the proto that should be contained
            in the values.
    Returns:
        the input for chainability
    Raises:
        ValueError if one of the values doesn't have the right format.
    """
    for value in values:
        proto = proto_class()
        # The "_id" field is not part of the proto schema: take it out while
        # parsing and put it back afterwards.
        _id = value.pop('_id', None)
        # Enforce Proto schema.
        try:
            json_format.Parse(json.dumps(value), proto)
        except json_format.ParseError as error:
            raise ValueError('Error while parsing:\n%s\n%s' % (
                json.dumps(value, indent=2), error)) from error
        finally:
            # Restore the ID even when parsing failed, so that the caller's
            # dict is never left modified.
            if _id is not None:
                value['_id'] = _id
    return values


if __name__ == '__main__':
    mongo.importer_main(airtable2dicts, 'test')  # pragma: no cover
file3 = pandas.read_csv(file3_path) file3.columns = ['postcodes', 'postcode'] city_stats = cleaned_data.french_city_stats(data_folder) city_stats = city_stats[~city_stats.city_id. isin(['13055', '75056', '69123'])] postcode_to_range_mapping = {} for zip_codes in city_stats.zipCode: for zip_code in zip_codes.split('-'): postcode_to_range_mapping[zip_code] = zip_codes job_groups = cleaned_data.rome_job_groups(data_folder) rome_ids = job_groups.reset_index() rome_ids['merge_id'] = 1 rome_ids = rome_ids[['code_rome', 'merge_id']] rome_ids.columns = ['rome_id', 'merge_id'] file3['merge_id'] = 1 outer_product = pandas.merge(file3, rome_ids, how='outer', on=['merge_id']) massaged_file3 = outer_product[['rome_id', 'postcode']] data_zones = pandas.concat([data_zones, massaged_file3]) padded_postcodes = data_zones.postcode.astype(str).str.pad(5, 'left', '0') data_zones['postcodes'] = padded_postcodes.map(postcode_to_range_mapping) data_zones.drop_duplicates(['rome_id', 'postcodes'], inplace=True) return data_zones if __name__ == "__main__": mongo.importer_main(csv2dicts, 'unverified_data_zones') # pragma: no cover
for proto_name, airtable_name in _AIRTABLE_ASSET_TO_PROTO_FIELD.items(): value = airtable_fields.get(airtable_name) if value: try: assets[proto_name] = _assert_markdown_list(value) except ValueError as error: errors.append( ValueError('The field %s is not formatted correctly: %s' % (airtable_name, error))) if errors: raise ValueError('The job %s has %d, errors:\n%s' % (airtable_fields.get('code_rome'), len(errors), '\n'.join(str(error) for error in errors))) return airtable_fields['code_rome'], assets def _assert_markdown_list(value): lines = value.strip().split('\n') if not lines: return '' for line in lines: if not _MARKDOWN_LIST_LINE_REGEXP.match(line): raise ValueError( 'Each line should start with a * and an upper case, found: %s' % line) return '\n'.join(lines) if __name__ == "__main__": mongo.importer_main(make_dicts, 'job_group_info') # pragma: no cover
- departement_id: the ID of the département or None if the group covers multiple départements. - region_id: the ID of the région or None if the group covers multiple régions. The DataFrame has only one row indexed with <city_id>:<code_rome>. """ if len(job_seekers) < _MINIMUM_GROUP_SIZE: return None estimation = { 'days': int(job_seekers.duration.median()), } departement_ids = job_seekers.departement_id.unique() region_ids = job_seekers.region_id.unique() code_rome = job_seekers.iloc[0]['code_rome'] group_index = job_seekers.iloc[0]['city_id'] + ':' + code_rome return pandas.DataFrame( { 'city_id': [job_seekers.iloc[0]['city_id']], 'city_name': [job_seekers.iloc[0]['city_name']], 'code_rome': [code_rome], 'duration': [estimation], 'departement_id': [departement_ids[0] if len(departement_ids) == 1 else None], 'region_id': [region_ids[0] if len(region_ids) == 1 else None], }, index=[group_index]) if __name__ == "__main__": mongo.importer_main(fhs2dicts, 'fhs_local_diagnosis') # pragma: no cover
def _workup_to_proto(event, departements):
    """Convert a WorkUp event to a dict ready to be imported.

    Args:
        event: a dict-like event with the fields id, slug, organiser, date
            and title, an optional address and, unless the address is
            "en ligne", a latitude and a longitude.
        departements: a table with the columns departement_id,
            min_latitude, max_latitude, min_longitude and max_longitude
            (presumably a pandas DataFrame — confirm against the caller).
    Returns:
        A dict with the _id, filters, link, organiser, startDate and title
        of the event.
    Raises:
        ValueError: if the event is not online and its position is not
            within the buffered bounds of any département.
    """
    if event.get('address', '').strip().lower() == 'en ligne':
        # Online events are not restricted to any location.
        geo_filters = []
    else:
        # Keep the départements whose bounding box, enlarged by the buffers,
        # contains the position of the event.
        close_departements = departements[
            (departements.max_latitude + _LAT_BUFFER >= event['latitude']) &
            (departements.min_latitude - _LAT_BUFFER <= event['latitude']) &
            (departements.max_longitude + _LNG_BUFFER >= event['longitude']) &
            (departements.min_longitude - _LNG_BUFFER <= event['longitude'])]
        if close_departements.empty:
            # Format the message: passing the event as a second argument to
            # ValueError would leave the "%s" placeholder unfilled.
            raise ValueError(
                'Event is next to no French départements:\n%s' % (event,))
        geo_filters = [
            'for-departement(%s)' % ','.join(sorted(close_departements.departement_id))
        ]

    # TODO(pascal): Add better filters for reorientation.

    return {
        '_id': event['id'],
        'filters': geo_filters,
        'link': _WORKUP_EVENT_URL % event['slug'],
        'organiser': event['organiser'],
        'startDate': event['date'],
        'title': event['title'],
    }


if __name__ == '__main__':
    mongo.importer_main(events2dicts, 'events')  # pragma: no cover
from bob_emploi.lib import mongo


def csv2dicts(stats_filename):
    """Prepare cities for upload to MongoDB.

    Args:
        stats_filename: path to a file containing stats about cities.
    Returns:
        A list of dict JSON-like object compatible with the geo_pb2.FrenchCity
        proto. Rows with a missing ID, longitude or latitude are left out.
    """
    # The file has no header row: only columns 10, 19 and 20 are read, and
    # named _id, longitude and latitude respectively.
    city_stats = pandas.read_csv(
        stats_filename,
        sep=',',
        header=None,
        usecols=[10, 19, 20],
        names=['_id', 'longitude', 'latitude'],
        dtype={
            '_id': str,
            'latitude': float,
            'longitude': float
        })
    # dropna() returns a new DataFrame, so its result has to be kept for the
    # incomplete rows to actually be removed.
    city_stats = city_stats.dropna()
    return city_stats.to_dict(orient='records')


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'cities')  # pragma: no cover
} def _update_salaries(imt): old_imt = imt.loc['imt'] new_salaries = imt.loc['updated_salaries'] if not isinstance(new_salaries, dict): return old_imt return dict(old_imt, **new_salaries) def finalize_salary_estimation(estimation): """Finalize the data for a SalaryEstimation proto. Args: estimation: a dict with min/max/medianSalary. This dict will be modified. Returns: The input dict with additional fields to be displayed. """ estimation['shortText'] = '{} - {}'.format( locale.format('%d', estimation['minSalary'], grouping=True), locale.format('%d', estimation['maxSalary'], grouping=True)) estimation['unit'] = 'ANNUAL_GROSS_SALARY' return estimation if __name__ == '__main__': mongo.importer_main(csv2dicts, 'local_diagnosis') # pragma: no cover
def _get_jobs(self, count_threshold): jobs = self._get_sorted_requirements(_RequirementKind.job, count_threshold) for job_id, count, unused_percent_required in jobs: yield { 'percentSuggested': round(100 * count / self.num_offers), 'codeOgr': job_id, } def csv2dicts(job_offers_csv, colnames_txt): """Import the requirement from job offers grouped by Job Group in MongoDB. Args: job_offers_csv: Path of the csv containing the data. colnames_txt: Path to a file containing the name of the CSV's columns. Returns: Requirements as a JobRequirements JSON-proto compatible dict. """ job_groups = collections.defaultdict(_RequirementsCollector) for job_offer in job_offers.iterate( job_offers_csv, colnames_txt, _REQUIRED_FIELDS): job_groups[job_offer.rome_profession_card_code].collect(job_offer) return [ dict(job_groups[job_group_id].get_proto_dict(), _id=job_group_id) for job_group_id in sorted(job_groups)] if __name__ == "__main__": mongo.importer_main(csv2dicts, 'job_requirements') # pragma: no cover
required_fields = { _CITY_CODE_FIELD, _JOB_GROUP_CODE_FIELD, _LATITUDE_CODE_FIELD, _LONGITUDE_CODE_FIELD, _CITY_NAME_CODE_FIELD} offers_rows = job_offers.iterate(offers_file, colnames, required_fields) city_data = _list_hiring_cities(offers_rows, min_creation_date, data_folder) # Computing the threshold per job group. job_group_threshold = collections.defaultdict(float) for job_group, offers in city_data.offers_per_job_group.items(): job_group_threshold[job_group] = math.pow(offers, 0.6) / 40 job_group_to_kept_cities = collections.defaultdict(list) for job_group, city_ids in city_data.job_group_to_city_ids.items(): kept_cities = [] for city_id, offer_count in city_ids.items(): if offer_count > job_group_threshold[job_group]: kept_cities.append({'city': city_data.city_info[city_id], 'offers': offer_count}) job_group_to_kept_cities[job_group] = sorted( kept_cities, key=lambda k: k['offers'], reverse=True) return [ {'_id': job_group_id, 'hiringCities': job_group_weighted_cities} for job_group_id, job_group_weighted_cities in job_group_to_kept_cities.items()] if __name__ == '__main__': mongo.importer_main(extract_offers_per_cities, 'hiring_cities') # pragma: no cover
by_region = city_count.set_index(group_cols) by_region['region_count'] = region_count city_count = by_region.reset_index() # Compute country counts for each city. country_count = recent_offers.groupby('rome_id').id_offre.count() by_country = city_count.set_index('rome_id') by_country['country_count'] = country_count city_count = by_country.reset_index() for row in city_count.itertuples(): res.append({ '_id': row.rome_id + ':c' + row.city_code, 'city': { 'cityId': row.city_code, 'name': row.city_name, 'departementId': row.departement_code, 'departementName': row.departement_name, 'regionId': row.region_code, 'regionName': row.region_name, }, 'cityCount': int(row.city_count), 'departementCount': int(row.departement_count), 'regionCount': int(row.region_count), 'countryCount': int(row.country_count), }) return res if __name__ == "__main__": mongo.importer_main(csv2dicts, 'job_offers') # pragma: no cover
"""Importer for e-Territoire URLs into MongoDB.""" import requests from bob_emploi.lib import mongo def get_cities_dicts(): """Download e-Territoire URLs from their website and prepare them. Returns: For each city (by INSEE ID) a deep link URL. """ response = requests.get('http://www.eterritoire.fr/webservice/listeCommunes.php') response.raise_for_status() urls = response.json() return [{'_id': u['idinsee'], 'path': u['url']} for u in urls] if __name__ == '__main__': mongo.importer_main(get_cities_dicts, 'eterritoire_links') # pragma: no-cover
# (Tail of a function whose header precedes this excerpt.)
    ]]
    # Rename the columns in place to the field names used in the output.
    samples.rename(columns={
        'target_job': 'codeOgr',
        'target_job_name': 'name',
        'target_job_masculine_name': 'masculineName',
        'target_job_feminine_name': 'feminineName',
    }, inplace=True)
    # The job group ID and name are read from the first row of jobs.
    return {
        'jobGroup': {
            'romeId': jobs.target_job_group.iloc[0],
            'name': jobs.target_job_group_name.iloc[0],
            'samples': samples.to_dict('records'),
        }
    }


def _sample_jobs(num_samples):
    """Create a function that keeps at most num_samples jobs picked at random.

    The returned function takes a table of jobs (presumably a pandas
    DataFrame — confirm against the caller) and returns a list of dicts
    restricted to the codeOgr, name, masculineName and feminineName columns.
    """
    def _sampling(jobs):
        # Only sample when there are more jobs than requested.
        if len(jobs.index) > num_samples:
            jobs = jobs.sample(n=num_samples)
        jobs = jobs[['codeOgr', 'name', 'masculineName', 'feminineName']]
        return jobs.to_dict('records')
    return _sampling


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'similar_jobs')  # pragma: no cover
from bob_emploi.lib import mongo

# Airtable API key, read once from the environment when the module is loaded.
API_KEY = os.getenv('AIRTABLE_API_KEY')


def airtable2dicts(base_id, table, view=None):
    """Import the users email from Airtable.

    Args:
        base_id: the ID of your Airtable app.
        table: the name of the table to import.
        view: optional - the name of the view to import.
    Returns:
        an iterable of dict with the JSON values of the proto: one dict per
        record, with the record's email (or an empty string) as "_id".
    Raises:
        ValueError: if the AIRTABLE_API_KEY env var is not set.
    """
    if not API_KEY:
        raise ValueError(
            'No API key found. Create an airtable API key at '
            'https://airtable.com/account and set it in the AIRTABLE_API_KEY '
            'env var.')
    records = airtable.Airtable(base_id, API_KEY).iterate(table, view=view)
    return [
        {'_id': record.get('fields', {}).get('email', '')}
        for record in records]


if __name__ == '__main__':
    mongo.importer_main(airtable2dicts, 'show_unverified_data_users')  # pragma: no cover