def csv2dicts(rome_csv_pattern):
    """Import the ROME mobility data in MongoDB.

    We group the mobility data as JobGroups (we find a set of similar jobs
    either for a specific job or for a job group). To get all mobility data
    for a given job, you have to look both for the data for this job (keyed
    by OGR code) and for the data for its job group (keyed by ROME code). As
    OGR code and ROME code use different namespaces there's no conflict to use
    it directly with its key.

    Args:
        rome_csv_pattern: pattern of paths to CSV files containing the ROME
            data. It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'rubrique_mobilite' and
            'referentiel_appellation'.

    Returns:
        Whatever `dataframe2dicts` builds from the enriched mobility table
        (one row per mobility link, with target names and samples attached).
    """

    mobility = pandas.read_csv(
        rome_csv_pattern.format('rubrique_mobilite'), dtype=str)
    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))

    jobs.index.name = 'codeOgr'
    masculine_job_names, feminine_job_names = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine_job_names
    jobs['feminineName'] = feminine_job_names
    # Keep the names keyed by the jobs' index before that index is turned
    # into a regular column: they are used below to map target job codes.
    jobs_names = jobs.name
    jobs.reset_index(inplace=True)
    jobs_samples = jobs.groupby('code_rome').apply(_sample_jobs(3))

    mobility.rename(columns={
        'code_rome': 'source_job_group',
        'code_appellation_source': 'source_job',
        'code_rome_cible': 'target_job_group',
        'code_appellation_cible': 'target_job',
        'code_type_mobilite': 'mobility_type',
    }, inplace=True)

    # Missing values are filled on the mapped Series *before* assignment.
    # Calling `mobility.<column>.fillna(..., inplace=True)` operates on a
    # temporary Series: it is deprecated in pandas 2.x and is a no-op with
    # Copy-on-Write, which would leave NaN in the output.
    mobility['target_job_group_name'] = \
        mobility.target_job_group.map(job_groups.name).fillna('')
    mobility['target_job_group_samples'] = \
        mobility.target_job_group.map(jobs_samples).fillna(False)
    mobility['target_job_name'] = mobility.target_job.map(jobs_names).fillna('')
    mobility['target_job_masculine_name'] = \
        mobility.target_job.map(masculine_job_names).fillna('')
    mobility['target_job_feminine_name'] = \
        mobility.target_job.map(feminine_job_names).fillna('')

    return dataframe2dicts(mobility)
def main(rome_appellation_csv: str, output_txt: str) -> None:
    """Sample ROME jobs in job groups.

    One job is drawn at random from each job group (grouped by 'code_rome')
    and its masculine name is written to the output file.

    Args:
        rome_appellation_csv: path to a CSV file containing all ROME jobs.
        output_txt: path where to create the output txt file. It will get
            populated with a list of masculine job names, one per line.
    """

    jobs = cleaned_data.rome_jobs(filename=rome_appellation_csv)
    samples = jobs.groupby('code_rome').apply(lambda group: group.sample(1))
    # Only the masculine names are exported; the feminine ones are discarded.
    names, unused_ = rome_genderization.genderize(samples.name)
    # Job names contain accented characters: pin the encoding instead of
    # relying on the platform default, which is not UTF-8 everywhere.
    with open(output_txt, 'w', encoding='utf-8') as output:
        output.write('\n'.join(names.tolist()) + '\n')
def json_to_dicts(data_folder: str, json_appellation: str) -> List['_JobSuggest']:
    """Transform the incoming JSON list to an Algolia-ready list of job suggestions.

    Args:
        data_folder: passed as-is to `cleaned_data.rome_jobs` to load the ROME
            jobs that are joined on the French masculine job name.
        json_appellation: path (or anything `pandas.read_json` accepts) of the
            JSON list of appellations, in French ('fr') and Dutch ('nl').

    Returns:
        One dict per French/Dutch appellation pair, keyed by 'objectID', with
        null values removed.
    """

    # Read appellations from JSON.
    appellations = pandas.read_json(json_appellation).rename(
        columns=_RENAME_COLUMN_NAMES)
    # Add missing accents.
    _add_accents(appellations, ('jobGroupName', 'jobName'))
    # Clean non-genderized tags.
    _drop_regex(appellations, 'jobName', _BIGENDER_SUFFIX_REGEX)
    # Genderize names.
    _genderize(appellations, 'jobName', suffixes=('Masculine', 'Feminine'))

    # Join Dutch and French names.
    # The code keeps only what follows its first '-', then goes through the
    # translation pairing. The result is assigned back explicitly: an inplace
    # `replace` on `appellations['codeCompetent']` acts on a temporary Series
    # and is silently lost with pandas Copy-on-Write.
    appellations['codeCompetent'] = appellations['codeCompetent'] \
        .apply(lambda code: code.split('-', 1)[1]) \
        .replace(_TRANSLATION_PAIRING)
    translated = appellations[appellations.lang == 'fr'].merge(
        appellations[appellations.lang == 'nl'],
        on=['codeCompetent', 'romeId', 'extendedRomeId'],
        suffixes=('Fr', 'Nl'))
    translated = translated.drop(columns=['langFr', 'langNl'])

    # Set Competent code as object ID.
    translated = translated.rename(columns={'codeCompetent': 'objectID'})

    # Join with ROME jobs, when available.
    rome_jobs = cleaned_data.rome_jobs(data_folder).reset_index()
    _genderize(rome_jobs, 'name')
    # Select and rename in one expression so that no inplace operation is
    # applied to a column-subset slice of the frame.
    rome_jobs = rome_jobs[['code_ogr', 'name_masculin']].rename(columns={
        'code_ogr': 'codeOgr',
        'name_masculin': 'jobNameMasculineFr',
    })
    suggestions = translated.merge(rome_jobs, how='left', on='jobNameMasculineFr')

    # Convert from pandas.DataFrame to Python list of dicts.
    records = suggestions.to_dict(orient='records')
    return [
        typing.cast(
            '_JobSuggest',
            {key: value for key, value in record.items() if not pandas.isnull(value)})
        for record in records
    ]
def make_dicts(
        rome_csv_pattern,
        job_requirements_json,
        job_application_complexity_json,
        application_mode_csv,
        rome_fap_crosswalk_txt,
        handcrafted_assets_airtable,
        domains_airtable,
        info_by_prefix_airtable,
        fap_growth_2012_2022_csv):
    """Import job info in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME
            data. It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels',
            'referentiel_env_travail', 'coherence_item' and
            'referentiel_competence'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application
            mode data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group
            of job group (by ROME ID prefix).
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.

    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt)
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv)

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Job groups without any job get NaN from the alignment: use [] instead.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Add skills.
    rome_to_skills = cleaned_data.rome_to_skills(
        filename_items=rome_csv_pattern.format('coherence_item'),
        filename_skills=rome_csv_pattern.format('referentiel_competence'))
    skills_grouped = rome_to_skills.groupby('code_rome')
    job_groups['requirements'] = skills_grouped.apply(_group_skills_as_proto_list)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Load requirements from json file (JSON is UTF-8: do not depend on the
    # platform default encoding).
    with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
    job_requirements_dict = {
        job_requirement.pop('_id'): job_requirement
        for job_requirement in job_requirements_list}

    # Combine requirements from the json file, then from AirTable, in a single
    # pass. The dicts stored in the column are updated in place, and AirTable
    # values still override the JSON ones as they are applied last.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            job_requirements_dict.get(job_group.Index, {}))
        job_group.requirements.update(
            handcrafted_assets.get(job_group.Index, {}))

    # NOTE: every fillna below is assigned back to the column. The previous
    # `job_groups.<column>.fillna(..., inplace=True)` form works on a temporary
    # Series, is deprecated in pandas 2.x and a no-op with Copy-on-Write.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = \
        application_complexity['applicationComplexity']
    job_groups['applicationComplexity'] = \
        job_groups['applicationComplexity'].fillna('UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Holland Codes https://en.wikipedia.org/wiki/Holland_Codes (will later
    # be used for job similarity measures), then description, working
    # environment and requirement as text. Missing values become ''.
    for column, values in (
            ('hollandCodeMajor', holland_codes.major),
            ('hollandCodeMinor', holland_codes.minor),
            ('description', rome_texts.definition),
            ('workingEnvironment', rome_texts.working_environment),
            ('requirementsText', rome_texts.requirements),
    ):
        job_groups[column] = values
        job_groups[column] = job_groups[column].fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = \
        rome_work_environments['name'].map(sector_domains)
    job_groups['workEnvironmentKeywords'] = \
        rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
        lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    job_groups['growth20122022'] = _get_growth_2012_2022(
        fap_growth_2012_2022, rome_fap_crosswalk_txt)
    # 0 is reserved for "unknown growth" (see the fillna right after), so an
    # actual growth of 0 is nudged to a tiny non-zero value.
    job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
    job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return job_groups.to_dict('records')
def make_dicts(
        rome_csv_pattern: str,
        job_requirements_json: str,
        job_application_complexity_json: str,
        application_mode_csv: str,
        rome_fap_crosswalk_txt: str,
        handcrafted_assets_airtable: str,
        domains_airtable: str,
        strict_diplomas_airtable: str,
        info_by_prefix_airtable: str,
        fap_growth_2012_2022_csv: str,
        imt_market_score_csv: str,
        jobboards_airtable: Optional[str] = None,
        skills_for_future_airtable: Optional[str] = None,
        specific_to_job_airtable: Optional[str] = None) \
        -> List[Dict[str, Any]]:
    """Import job info in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME
            data. It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels'
            and 'referentiel_env_travail'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application
            mode data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
        strict_diplomas_airtable: the base ID and the table name joined by a
            ':' of the AirTable which tells if a diploma is strictly required.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group
            of job group (by ROME ID prefix).
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.
        imt_market_score_csv: path to a CSV containing market score info from
            IMT.
        jobboards_airtable: the base ID and the table name joined by a ':' of
            the Airtable of the job boards.
        skills_for_future_airtable: the base ID and the table name joined by a
            ':' of the Airtable of the skills for the future.
        specific_to_job_airtable: the base ID and the table name joined by a
            ':' of the Airtable of the specific to job pieces advice.

    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt)
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv)
    jobboards_by_rome = _load_items_from_airtable(
        'JobBoard', job_groups.index, jobboards_airtable, 'for-job-group')
    skills_for_future_by_rome = _load_items_from_airtable(
        'Skill', job_groups.index, skills_for_future_airtable, 'rome_prefixes')
    specific_to_job_by_rome = _load_items_from_airtable(
        'DynamicAdvice', job_groups.index, specific_to_job_airtable, 'for-job-group')
    users_highest_degrees = _load_highest_degrees_from_mongo()

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Job groups without any job get NaN from the alignment: use [] instead.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Combine requirements from json file (JSON is UTF-8: do not depend on the
    # platform default encoding).
    with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
    job_requirements_dict = {
        job_requirement.pop('_id'): job_requirement
        for job_requirement in job_requirements_list
    }
    job_groups['requirements'] = job_groups.index.map(job_requirements_dict)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Combine requirements from AirTable: the dicts stored in the column are
    # updated in place.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            handcrafted_assets.get(job_group.Index, {}))

    # NOTE: every fillna below is assigned back to the column. The previous
    # `job_groups.<column>.fillna(..., inplace=True)` form works on a temporary
    # Series, is deprecated in pandas 2.x and a no-op with Copy-on-Write.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = \
        application_complexity['applicationComplexity']
    job_groups['applicationComplexity'] = \
        job_groups['applicationComplexity'].fillna('UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Holland Codes https://en.wikipedia.org/wiki/Holland_Codes (will later
    # be used for job similarity measures), then description, working
    # environment and requirement as text. Missing values become ''.
    for column, values in (
            ('hollandCodeMajor', holland_codes.major),
            ('hollandCodeMinor', holland_codes.minor),
            ('description', rome_texts.definition),
            ('workingEnvironment', rome_texts.working_environment),
            ('requirementsText', rome_texts.requirements),
    ):
        job_groups[column] = values
        job_groups[column] = job_groups[column].fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = \
        rome_work_environments['name'].map(sector_domains)
    job_groups['workEnvironmentKeywords'] = \
        rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
        lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    job_groups['growth20122022'] = _get_growth_2012_2022(
        fap_growth_2012_2022, rome_fap_crosswalk_txt)
    # 0 is reserved for "unknown growth" (see the fillna right after), so an
    # actual growth of 0 is nudged to a tiny non-zero value.
    job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
    job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Add best departements.
    job_groups['departementScores'] = _get_less_stressful_departements_count(
        imt_market_score_csv)
    # Fill NaN with empty [].
    job_groups['departementScores'] = job_groups.departementScores.apply(
        lambda s: s if isinstance(s, list) else [])
    # Only the first 11 entries of the scores list are kept as "best".
    job_groups['bestDepartements'] = job_groups.departementScores.apply(
        lambda ds: ds[:11])

    # Add national market score.
    job_groups['nationalMarketScore'] = _get_national_market_scores(
        imt_market_score_csv)
    job_groups['nationalMarketScore'] = job_groups['nationalMarketScore'].fillna(0)

    # Add diploma requirements.
    job_groups['is_diploma_strictly_required'] = _load_strict_diplomas_from_airtable(
        *strict_diplomas_airtable.split(':'))
    job_groups['is_diploma_strictly_required'] = \
        job_groups['is_diploma_strictly_required'].fillna(False)

    # Add job_boards.
    if jobboards_by_rome:
        job_groups['jobBoards'] = job_groups.index.map(jobboards_by_rome)

    # Add skills for the future.
    if skills_for_future_by_rome:
        job_groups['skillsForFuture'] = job_groups.index.map(
            skills_for_future_by_rome)

    # Add specific to job advice.
    if specific_to_job_by_rome:
        job_groups['specificAdvice'] = job_groups.index.map(
            specific_to_job_by_rome)

    # Add highest degree counts from user base.
    if users_highest_degrees is not None:
        job_groups['userDegrees'] = users_highest_degrees
        # Fill NaN with empty [].
        job_groups['userDegrees'] = job_groups.userDegrees.apply(
            lambda d: d if isinstance(d, list) else [])

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return typing.cast(List[Dict[str, Any]], job_groups.to_dict('records'))
def make_dicts(
        *,
        rome_csv_pattern: str,
        application_mode_csv: Optional[str] = None,
        brookings_json: Optional[str] = None,
        domains_airtable: Optional[str] = None,
        fap_growth_2012_2022_csv: Optional[str] = None,
        handcrafted_assets_airtable: Optional[str] = None,
        imt_market_score_csv: Optional[str] = None,
        info_by_prefix_airtable: Optional[str] = None,
        jobboards_airtable: Optional[str] = None,
        job_application_complexity_json: Optional[str] = None,
        job_requirements_json: Optional[str] = None,
        rome_fap_crosswalk_txt: Optional[str] = None,
        rome_isco_crosswalk_xlsx: Optional[str] = None,
        skills_for_future_airtable: Optional[str] = None,
        soc_2010_xls: Optional[str] = None,
        soc_isco_crosswalk_xls: Optional[str] = None,
        specific_to_job_airtable: Optional[str] = None,
        strict_diplomas_airtable: Optional[str] = None,
        trainings_csv: Optional[str] = None,
        sampler_generator: Callable[[Optional[int]], Sampler] = _create_jobs_sampler) \
        -> list[dict[str, Any]]:
    """Import job info in MongoDB.

    Every argument except rome_csv_pattern is optional: the fields that depend
    on a missing input are simply not added to the output.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME
            data. It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels',
            'referentiel_env_travail' and 'item_arborescence'.
        job_requirements_json: path to a JSON file containing requirements per
            job group. For `requirements`.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group. For
            `application_complexity`.
        application_mode_csv: path to a CSV file containing the application
            mode data from emploi-store-dev API. For `application_modes`.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes. For `application_modes`,
            `growth_2012_2022`.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required). For `requirements`.
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector. For
            `work_environment_keywords`.
        strict_diplomas_airtable: the base ID and the table name joined by a
            ':' of the AirTable which tells if a diploma is strictly required.
            For `is_diploma_strictly_required`.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group
            of job group (by ROME ID prefix). For `covidRisk`, `domain`,
            `hasFreelancers`, `inAWorkplace`, `inDomain`, `likeYourWorkplace`,
            `placePlural`, `preferredApplicationMedium`, `whatILoveAbout`,
            `toTheWorkplace`, `whySpecificCompany`, `atVariousCompanies`,
            `whatILoveAboutFeminine`.
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022. For `growth_2012_2022`.
        imt_market_score_csv: path to a CSV containing market score info from
            IMT. For `best_departements`, `departement_scores`,
            `national_market_score`.
        jobboards_airtable: the base ID and the table name joined by a ':' of
            the Airtable of the job boards. For `job_boards`.
        skills_for_future_airtable: the base ID and the table name joined by a
            ':' of the Airtable of the skills for the future. For
            `skills_for_future`.
        specific_to_job_airtable: the base ID and the table name joined by a
            ':' of the Airtable of the specific to job pieces advice. For
            `specific_advice`.
        brookings_json: path to a JSON file with data from Brookings report for
            automation risk. For `automation_risk`.
        soc_2010_xls: path to an XLS file with the names of US SOC 2010 groups.
            For `automation_risk`.
        soc_isco_crosswalk_xls: path to an XLS file of the crosswalk btw US SOC
            2010 and ISCO-08. For `automation_risk`.
        rome_isco_crosswalk_xlsx: path to an XLSX file of the crosswalk btw
            ROME and ISCO-08. For `automation_risk`.
        trainings_csv: path to a CSV with trainings data. For `training_count`.
        sampler_generator: factory called with 3 (for `samples`) and with None
            (for `jobs`); what it returns is applied to the jobs of each job
            group.

    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = \
        _load_assets_from_airtable(*handcrafted_assets_airtable.split(':')) \
        if handcrafted_assets_airtable else {}
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':')) \
        if domains_airtable else {}
    info_by_prefix = \
        _load_prefix_info_from_airtable(job_groups.index, info_by_prefix_airtable) \
        if info_by_prefix_airtable else None
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt,
    ) if application_mode_csv and rome_fap_crosswalk_txt else None
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv) \
        if fap_growth_2012_2022_csv else None
    jobboards_by_rome = airtable_to_protos.load_items_from_prefix(
        'JobBoard', job_groups.index, jobboards_airtable, 'for-job-group',
    ) if jobboards_airtable else None
    skills_for_future_by_rome = airtable_to_protos.load_items_from_prefix(
        'Skill', job_groups.index, skills_for_future_airtable, 'rome_prefixes',
    ) if skills_for_future_airtable else None
    specific_to_job_by_rome = airtable_to_protos.load_items_from_prefix(
        'DynamicAdvice', job_groups.index, specific_to_job_airtable, 'fr:for-job-group',
    ) if specific_to_job_airtable else None
    users_highest_degrees = _load_highest_degrees_from_mongo()

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(sampler_generator(3))
    # Job groups without any job get NaN from the alignment: use [] instead.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(sampler_generator(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    if info_by_prefix is not None:
        job_groups = job_groups.join(info_by_prefix)

    # Combine requirements from json file.
    if job_requirements_json:
        with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
            job_requirements_list = json.load(job_requirements_file)
        job_requirements_dict = {
            job_requirement.pop('_id'): job_requirement
            for job_requirement in job_requirements_list}
        job_groups['requirements'] = job_groups.index.map(job_requirements_dict)
        # Replace NaN by empty dicts.
        job_groups['requirements'] = job_groups.requirements.apply(
            lambda r: r if isinstance(r, dict) else {})

    # Combine requirements from AirTable.
    if handcrafted_assets:
        if 'requirements' not in job_groups.columns:
            # No JSON requirements were loaded: start from one fresh empty
            # dict per job group so that the assets can still be attached.
            job_groups['requirements'] = [{} for _ in range(len(job_groups))]
        # The dicts stored in the column are updated in place.
        for job_group in job_groups.itertuples():
            job_group.requirements.update(handcrafted_assets.get(job_group.Index, {}))

    # NOTE: every fillna below is assigned back to the column. The previous
    # `job_groups.<column>.fillna(..., inplace=True)` form works on a temporary
    # Series, is deprecated in pandas 2.x and a no-op with Copy-on-Write.
    if job_application_complexity_json:
        application_complexity = pandas.read_json(job_application_complexity_json)
        application_complexity.set_index('_id', inplace=True)
        job_groups['applicationComplexity'] = \
            application_complexity['applicationComplexity']
        job_groups['applicationComplexity'] = \
            job_groups['applicationComplexity'].fillna('UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Holland Codes https://en.wikipedia.org/wiki/Holland_Codes (will later
    # be used for job similarity measures), then description, working
    # environment and requirement as text. Missing values become ''.
    for column, values in (
            ('hollandCodeMajor', holland_codes.major),
            ('hollandCodeMinor', holland_codes.minor),
            ('description', rome_texts.definition),
            ('workingEnvironment', rome_texts.working_environment),
            ('requirementsText', rome_texts.requirements),
    ):
        job_groups[column] = values
        job_groups[column] = job_groups[column].fillna('')

    # Add work environment items.
    if sector_domains:
        rome_work_environments['domain'] = \
            rome_work_environments['name'].map(sector_domains)
        job_groups['workEnvironmentKeywords'] = \
            rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
        # Fill NaN with empty {}.
        job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    if application_modes is not None:
        job_groups['applicationModes'] = application_modes
        job_groups['applicationModes'] = job_groups.applicationModes.apply(
            lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    if fap_growth_2012_2022 is not None and rome_fap_crosswalk_txt:
        job_groups['growth20122022'] = _get_growth_2012_2022(
            fap_growth_2012_2022, rome_fap_crosswalk_txt)
        # 0 is reserved for "unknown growth" (see the fillna right after), so
        # an actual growth of 0 is nudged to a tiny non-zero value.
        job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
        job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Add automation risk.
    if brookings_json and soc_2010_xls and soc_isco_crosswalk_xls and rome_isco_crosswalk_xlsx:
        # The helper's result is scaled by 100 and rounded to an integer.
        job_groups['automationRisk'] = _get_automation_risk(
            brookings_json=brookings_json,
            soc_2010_xls=soc_2010_xls,
            soc_isco_crosswalk_xls=soc_isco_crosswalk_xls,
            rome_isco_crosswalk_xlsx=rome_isco_crosswalk_xlsx,
        ).mul(100).round(0).astype(int)
        # Mark 0 values as 1, as 0 means undefined.
        job_groups.loc[job_groups['automationRisk'] == 0, 'automationRisk'] = 1
        job_groups['automationRisk'] = job_groups['automationRisk'].fillna(0)

    # Add best departements.
    if imt_market_score_csv:
        market_scores = cleaned_data.market_scores(filename=imt_market_score_csv)
        # Keep only the rows whose area type code is 'D', and expose the
        # yearly average of offers per 10 candidates as the market score.
        market_scores = (
            market_scores[market_scores.AREA_TYPE_CODE == 'D']
            .reset_index()
            .drop([
                'market_score',
                'yearly_avg_offers_denominator',
                'AREA_TYPE_CODE',
            ], axis='columns')
            .rename({
                'departement_id': 'district_id',
                'rome_id': 'job_group',
                'yearly_avg_offers_per_10_candidates': 'market_score',
            }, axis='columns')
        )
        job_groups['departementScores'] = \
            market_score_derivatives.get_less_stressful_districts(market_scores)
        # Fill NaN with empty [].
        job_groups['departementScores'] = job_groups.departementScores.apply(
            lambda s: s if isinstance(s, list) else [])
        # TODO(cyrille): Drop this, once we're sure it's no more used in server.
        job_groups['bestDepartements'] = job_groups.departementScores.apply(
            lambda ds: ds[:11])

        # Add national market score.
        # TODO(cyrille): Add this in market_score_derivatives.
        job_groups['nationalMarketScore'] = \
            _get_national_market_scores(imt_market_score_csv)
        job_groups['nationalMarketScore'] = job_groups['nationalMarketScore'].fillna(0)

    # Add diploma requirements.
    if strict_diplomas_airtable:
        job_groups['is_diploma_strictly_required'] = _load_strict_diplomas_from_airtable(
            *strict_diplomas_airtable.split(':'))
        job_groups['is_diploma_strictly_required'] = \
            job_groups['is_diploma_strictly_required'].fillna(False)

    # Add job_boards.
    if jobboards_by_rome:
        job_groups['jobBoards'] = job_groups.index.map(jobboards_by_rome)

    # Add skills for the future.
    if skills_for_future_by_rome:
        job_groups['skillsForFuture'] = job_groups.index.map(skills_for_future_by_rome)

    # Add specific to job advice.
    if specific_to_job_by_rome:
        job_groups['specificAdvice'] = job_groups.index.map(specific_to_job_by_rome)

    # Add highest degree counts from user base.
    if users_highest_degrees is not None:
        job_groups['userDegrees'] = users_highest_degrees
        # Fill NaN with empty [].
        job_groups['userDegrees'] = job_groups.userDegrees.apply(
            lambda d: d if isinstance(d, list) else [])

    # Add training data.
    if trainings_csv:
        trainings = pandas.read_csv(trainings_csv)
        job_groups['trainingCount'] = \
            trainings.groupby('formation.proximiteRomes.code').apply(_count_trainings)
        job_groups['trainingCount'] = job_groups.trainingCount.apply(
            lambda counts: counts if isinstance(counts, dict) else {})

    # Add no-requirement flag: job groups listed as "without qualifications"
    # get 'FALSE', all the others get 'TRUE'.
    job_groups['hasAnyRequirements'] = cleaned_data.jobs_without_qualifications(
        filename=rome_csv_pattern.format('item_arborescence'),
    ).no_requirements.map(lambda unused: 'FALSE')
    job_groups['hasAnyRequirements'] = job_groups['hasAnyRequirements'].fillna('TRUE')

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return typing.cast(list[dict[str, Any]], job_groups.to_dict('records'))
def csv2dicts(
        market_score_csv, offers_csv, referentiel_code_rome_csv, rome_item_arborescence,
        referentiel_apellation_rome_csv):
    """Compute, per departement, the best jobs without qualification to reorient to.

    The result is a list of records meant to be imported in MongoDB.

    Args:
        market_score_csv: path to a CSV file containing the market stress data.
        offers_csv: path to a CSV file containing the job offer data.
        referentiel_code_rome_csv: path to a CSV file containing job groups.
        rome_item_arborescence: path to a CSV file containing ROME item arborescence.
        referentiel_apellation_rome_csv: path to a CSV file containing the ROME jobs
            (used to get job names and their genderized variants).

    Returns:
        A list of dicts, one per departement, each with two fields: '_id' (the
        departement ID) and 'departementJobStats', a dict whose 'jobs' key holds at
        most 5 job dicts (name, masculineName, feminineName, rome_id, offers,
        market_score) sorted by decreasing number of offers.
    """

    # Job names and their genderized variants, to be looked up by OGR code.
    jobs = cleaned_data.rome_jobs(filename=referentiel_apellation_rome_csv)
    jobs.index.name = 'codeOgr'
    masculine_names, feminine_names = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine_names
    jobs['feminineName'] = feminine_names

    # Load the job offers and harmonize their column names with the market scores.
    offers = pd.read_csv(offers_csv, dtype={'departement_code': str})
    offers = offers.rename(columns={
        'rome_profession_card_code': 'rome_id',
        'departement_code': 'departement_id',
    })

    def _to_ogr_code(code):
        # Go through int to get the same string representation as the jobs index.
        return str(int(code))

    # Attach the job names to each offer using its OGR code.
    offers['codeOgr'] = offers.rome_profession_code.apply(_to_ogr_code)
    for name_field in ('name', 'masculineName', 'feminineName'):
        offers[name_field] = offers.codeOgr.map(jobs[name_field])

    # Drop offers created on or after January 1st, 2017 to have exactly 2 years of data.
    is_before_2017 = offers.creation_date < '2017-01-01'
    offers = offers[is_before_2017]

    # Count the job offers per job and per departement, biggest counts first.
    job_and_area_keys = ['name', 'masculineName', 'feminineName', 'departement_id', 'rome_id']
    offers_per_job_and_dep = (
        offers.groupby(job_and_area_keys)
        .size()
        .to_frame('offers')
        .reset_index()
        .sort_values(by=['offers'], ascending=False)
    )

    # Inside each job group only keep the job with the most offers, to give at least
    # one concrete example to the user.
    # TODO(marielaure): Check if we could benefit from proposing more than one job
    # name.
    top_job_per_group = (
        offers_per_job_and_dep.groupby(['rome_id', 'departement_id'])
        .first()
        .reset_index()
    )

    # Get market score and keep only jobs that have at least a market score
    # (offers per 10 candidates) of 4, as described here:
    # https://github.com/bayesimpact/bob-emploi-internal/blob/master/data_analysis/notebooks/research/reorientation/reorient_market_stress_skilless.ipynb
    market_score = pd.read_csv(market_score_csv, dtype={'AREA_CODE': str})
    market_score = market_score.rename(columns={
        'AREA_CODE': 'departement_id',
        'ROME_PROFESSION_CARD_CODE': 'rome_id',
    })
    has_enough_tension = market_score.TENSION_RATIO >= 4
    tense_market_score = market_score[has_enough_tension]

    # Compute jobs without qualification.
    job_groups = pd.read_csv(referentiel_code_rome_csv)
    arborescence = pd.read_csv(rome_item_arborescence)
    # The strategy for filtering jobs without qualification is described here:
    # https://github.com/bayesimpact/bob-emploi-internal/blob/master/data_analysis/notebooks/research/jobbing/seasonal_offers.ipynb
    unqualified_root_code = '017'
    root_children = arborescence[arborescence.code_pere == unqualified_root_code]
    root_grandchildren = arborescence[arborescence.code_pere.isin(root_children.code_noeud)]
    unqualified_job_groups = pd.merge(
        root_grandchildren, job_groups,
        left_on=['code_item_arbo_associe'], right_on=['code_ogr'])
    unqualified_job_groups = unqualified_job_groups.rename(columns={'code_rome': 'rome_id'})
    unqualified_rome = unqualified_job_groups[['rome_id', 'libelle_rome']]

    # Restrict the market scores to those job groups, at the departement level.
    unqualified_market = pd.merge(unqualified_rome, tense_market_score, on='rome_id')
    is_departement = unqualified_market.AREA_TYPE_NAME == 'Département'
    departement_market = unqualified_market[is_departement]

    kept_columns = [
        'rome_id', 'departement_id', 'masculineName', 'feminineName', 'offers', 'name',
        'TENSION_RATIO',
    ]
    best_jobs = pd.merge(
        top_job_per_group, departement_market, on=['rome_id', 'departement_id'])[kept_columns]

    # Only keep the best jobs that have strictly more than _MIN_JOB_OFFERS offers over
    # the 2 years of data.
    has_enough_offers = best_jobs.offers > _MIN_JOB_OFFERS
    best_jobs = best_jobs[has_enough_offers].rename(columns={'TENSION_RATIO': 'market_score'})

    job_fields = ['name', 'masculineName', 'feminineName', 'rome_id', 'offers', 'market_score']

    def _first_job_as_dict(group):
        # Several rows may share the same keys: only the first one is kept.
        return group[job_fields].to_dict(orient='records')[0]

    job_per_dep_and_group = (
        best_jobs
        .groupby(['departement_id', 'rome_id', 'name', 'masculineName', 'feminineName', 'offers'])
        .apply(_first_job_as_dict)
        .to_frame('jobs')
        .reset_index()
        .rename(columns={'departement_id': '_id'})
    )

    def _top_jobs_as_dict(departement_jobs):
        # head() keeps the 5 jobs with the most offers.
        by_offers = departement_jobs.sort_values('offers', ascending=False)
        return by_offers[['jobs']].head().to_dict(orient='list')

    jobbing_stats = (
        job_per_dep_and_group
        .groupby('_id')
        .apply(_top_jobs_as_dict)
        .to_frame('departementJobStats')
        .reset_index()
    )

    return jobbing_stats.to_dict(orient='records')