コード例 #1
0
    def _genderize_lists(self, names):
        """Run genderize on a plain list of names and return plain lists.

        The names are wrapped in a pandas Series before being handed to
        rome_genderization.genderize. Both results are checked to have as many
        entries as the input before being converted back to lists.

        Args:
            names: a list of job names to genderize.
        Returns:
            A pair (masculine names, feminine names), each as a list.
        """

        expected_length = len(names)
        genderized = rome_genderization.genderize(pandas.Series(names))
        male_names, female_names = genderized
        # Check the masculine result first, then the feminine one.
        for gendered_names in (male_names, female_names):
            self.assertEqual(len(gendered_names), expected_length)
        return male_names.tolist(), female_names.tolist()
コード例 #2
0
def csv2dicts(rome_csv_pattern):
    """Import the ROME mobility data in MongoDB.

    We group the mobility data as JobGroups (we find a set of similar jobs
    either for a specific job or for a job group).

    To get all mobility data for a given job, you have to look both for the
    data for this job (keyed by OGR code) and for the data for its job group
    (keyed by ROME code). As OGR code and ROME code use different namespaces
    there's no conflict to use it directly with its key.

    Args:
        rome_csv_pattern: pattern of paths to CSV files containing the ROME
            data. It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'rubrique_mobilite' and
            'referentiel_appellation'.
    Returns:
        Whatever dataframe2dicts produces from the enriched mobility table.
    """

    mobility = pandas.read_csv(rome_csv_pattern.format('rubrique_mobilite'),
                               dtype=str)
    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))

    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    jobs.index.name = 'codeOgr'
    masculine_job_names, feminine_job_names = (rome_genderization.genderize(
        jobs.name))
    jobs['masculineName'] = masculine_job_names
    jobs['feminineName'] = feminine_job_names
    # Keep the names indexed by OGR code before the index gets reset below.
    jobs_names = jobs.name
    jobs.reset_index(inplace=True)
    jobs_samples = jobs.groupby('code_rome').apply(_sample_jobs(3))

    mobility.rename(columns={
        'code_rome': 'source_job_group',
        'code_appellation_source': 'source_job',
        'code_rome_cible': 'target_job_group',
        'code_appellation_cible': 'target_job',
        'code_type_mobilite': 'mobility_type',
    },
                    inplace=True)

    # Missing values are filled before assigning the column: calling
    # fillna(inplace=True) on a column accessed from the frame acts on a
    # temporary object and does not update the frame under Copy-on-Write.
    mobility['target_job_group_name'] = (
        mobility.target_job_group.map(job_groups.name).fillna(''))
    mobility['target_job_group_samples'] = (
        mobility.target_job_group.map(jobs_samples).fillna(False))
    mobility['target_job_name'] = (
        mobility.target_job.map(jobs_names).fillna(''))
    mobility['target_job_masculine_name'] = (
        mobility.target_job.map(masculine_job_names).fillna(''))
    mobility['target_job_feminine_name'] = (
        mobility.target_job.map(feminine_job_names).fillna(''))

    return dataframe2dicts(mobility)
コード例 #3
0
def main(rome_appellation_csv: str, output_txt: str) -> None:
    """Sample ROME jobs in job groups.

    One job is picked at random in each job group (grouped by 'code_rome'),
    and the masculine version of its name is written to the output file.

    Args:
        rome_appellation_csv: path to a CSV file containing all ROME jobs.
        output_txt: path where to create the output txt file. It will get
            populated with a list of masculine job names, one per line.
    """

    jobs = cleaned_data.rome_jobs(filename=rome_appellation_csv)
    samples = jobs.groupby('code_rome').apply(lambda d: d.sample(1))
    # Only the masculine names are exported, the feminine ones are dropped.
    names, unused_ = rome_genderization.genderize(samples.name)
    # Force UTF-8 so that accented job names are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(output_txt, 'w', encoding='utf-8') as output:
        output.write('\n'.join(names.tolist()) + '\n')
コード例 #4
0
def _genderize(
    data_frame: pandas.DataFrame,
    field: str,
    suffixes: Tuple[str, str] = ('_masculin', '_feminin')) -> None:
    """Add genderized versions of one column to a pandas DataFrame, in place.

    Two columns are created (or overwritten), named after the original column
    followed by each suffix: the first one gets the masculine values and the
    second one the feminine values.

    Args:
        data_frame: the DataFrame to update.
        field: the name of the column to genderize.
        suffixes: the suffixes of the new columns to create, masculine first.
    """

    source_column = data_frame[field]
    gendered = rome_genderization.genderize(source_column)
    masculine_values, feminine_values = gendered
    data_frame[field + suffixes[0]] = masculine_values
    data_frame[field + suffixes[1]] = feminine_values
コード例 #5
0
def make_dicts(
        rome_csv_pattern,
        job_requirements_json,
        job_application_complexity_json,
        application_mode_csv,
        rome_fap_crosswalk_txt,
        handcrafted_assets_airtable,
        domains_airtable,
        info_by_prefix_airtable,
        fap_growth_2012_2022_csv):
    """Import job info in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels',
            'referentiel_env_travail', 'coherence_item' and
            'referentiel_competence'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application mode
            data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group of
            job group (by ROME ID prefix).
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(*handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt)
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv)

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Replace NaN (job groups without any job) by empty lists.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Add skills.
    rome_to_skills = cleaned_data.rome_to_skills(
        filename_items=rome_csv_pattern.format('coherence_item'),
        filename_skills=rome_csv_pattern.format('referentiel_competence'))
    skills_grouped = rome_to_skills.groupby('code_rome')
    job_groups['requirements'] = skills_grouped.apply(
        _group_skills_as_proto_list)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Combine requirements from json file.
    with open(job_requirements_json) as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
        job_requirements_dict = {
            job_requirement.pop('_id'): job_requirement
            for job_requirement in job_requirements_list}
    # The requirements dicts are updated in place, which is why iterating on
    # tuples is enough to modify the content of the DataFrame.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            job_requirements_dict.get(job_group.Index, {}))

    # Combine requirements from AirTable.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(handcrafted_assets.get(job_group.Index, {}))

    # For all the columns below, missing values only appear once the data is
    # aligned on the job groups index, so they are filled after the assignment.
    # The filled column is assigned back explicitly: calling
    # fillna(inplace=True) on a column accessed from the frame acts on a
    # temporary object and does not update the frame under Copy-on-Write.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity['applicationComplexity']
    job_groups['applicationComplexity'] = job_groups['applicationComplexity'].fillna(
        'UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Holland Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMajor'] = job_groups['hollandCodeMajor'].fillna('')
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups['hollandCodeMinor'] = job_groups['hollandCodeMinor'].fillna('')

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['description'] = job_groups['description'].fillna('')
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['workingEnvironment'] = job_groups['workingEnvironment'].fillna('')
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups['requirementsText'] = job_groups['requirementsText'].fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = rome_work_environments['name'].map(sector_domains)
    job_groups['workEnvironmentKeywords'] = \
        rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
    # Fill NaN with empty {}.
    job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
        lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    job_groups['growth20122022'] = _get_growth_2012_2022(
        fap_growth_2012_2022, rome_fap_crosswalk_txt)
    # An actual growth of 0 is moved to a tiny non-zero value, so that 0 is
    # only used below for job groups without any growth data.
    job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
    job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return job_groups.to_dict('records')
コード例 #6
0
ファイル: job_group_info.py プロジェクト: b3rday/bob-emploi
def make_dicts(
        rome_csv_pattern: str,
        job_requirements_json: str,
        job_application_complexity_json: str,
        application_mode_csv: str,
        rome_fap_crosswalk_txt: str,
        handcrafted_assets_airtable: str,
        domains_airtable: str,
        strict_diplomas_airtable: str,
        info_by_prefix_airtable: str,
        fap_growth_2012_2022_csv: str,
        imt_market_score_csv: str,
        jobboards_airtable: Optional[str] = None,
        skills_for_future_airtable: Optional[str] = None,
        specific_to_job_airtable: Optional[str] = None) \
        -> List[Dict[str, Any]]:
    """Import job info in MongoDB.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels'
            and 'referentiel_env_travail'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
        application_mode_csv: path to a CSV file containing the application mode
            data from emploi-store-dev API.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
        strict_diplomas_airtable: the base ID and the table name joined by a ':' of the
            AirTable which tells if a diploma is strictly required.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group of
            job group (by ROME ID prefix).
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.
        imt_market_score_csv: path to a CSV containing market score info from IMT.
        jobboards_airtable: the base ID and the table name joined by a ':' of the Airtable of the
            job boards.
        skills_for_future_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the skills for the future.
        specific_to_job_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the specific to job pieces advice.
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(
        *handcrafted_assets_airtable.split(':'))
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':'))
    info_by_prefix = _load_prefix_info_from_airtable(
        job_groups.index, *info_by_prefix_airtable.split(':'))
    application_modes = _get_application_modes(application_mode_csv,
                                               rome_fap_crosswalk_txt)
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv)
    jobboards_by_rome = _load_items_from_airtable('JobBoard', job_groups.index,
                                                  jobboards_airtable,
                                                  'for-job-group')
    skills_for_future_by_rome = _load_items_from_airtable(
        'Skill', job_groups.index, skills_for_future_airtable, 'rome_prefixes')
    specific_to_job_by_rome = _load_items_from_airtable(
        'DynamicAdvice', job_groups.index, specific_to_job_airtable,
        'for-job-group')
    users_highest_degrees = _load_highest_degrees_from_mongo()

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(_create_jobs_sampler(3))
    # Replace NaN (job groups without any job) by empty lists.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(_create_jobs_sampler(None))
    job_groups['jobs'] = job_groups.jobs.apply(lambda s: s
                                               if isinstance(s, list) else [])

    # Add info by prefix.
    job_groups = job_groups.join(info_by_prefix)

    # Combine requirements from json file.
    with open(job_requirements_json) as job_requirements_file:
        job_requirements_list = json.load(job_requirements_file)
        job_requirements_dict = {
            job_requirement.pop('_id'): job_requirement
            for job_requirement in job_requirements_list
        }
    job_groups['requirements'] = job_groups.index.map(job_requirements_dict)
    # Replace NaN by empty dicts.
    job_groups['requirements'] = job_groups.requirements.apply(
        lambda r: r if isinstance(r, dict) else {})

    # Combine requirements from AirTable.
    # The requirements dicts are updated in place, which is why iterating on
    # tuples is enough to modify the content of the DataFrame.
    for job_group in job_groups.itertuples():
        job_group.requirements.update(
            handcrafted_assets.get(job_group.Index, {}))

    # For all the columns below, missing values only appear once the data is
    # aligned on the job groups index, so they are filled after the assignment.
    # The filled column is assigned back explicitly: calling
    # fillna(inplace=True) on a column accessed from the frame acts on a
    # temporary object and does not update the frame under Copy-on-Write.
    application_complexity = pandas.read_json(job_application_complexity_json)
    application_complexity.set_index('_id', inplace=True)
    job_groups['applicationComplexity'] = application_complexity[
        'applicationComplexity']
    job_groups['applicationComplexity'] = job_groups[
        'applicationComplexity'].fillna('UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Holland Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMajor'] = job_groups['hollandCodeMajor'].fillna('')
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups['hollandCodeMinor'] = job_groups['hollandCodeMinor'].fillna('')

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['description'] = job_groups['description'].fillna('')
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['workingEnvironment'] = job_groups['workingEnvironment'].fillna('')
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups['requirementsText'] = job_groups['requirementsText'].fillna('')

    # Add work environment items.
    rome_work_environments['domain'] = rome_work_environments['name'].map(
        sector_domains)
    job_groups['workEnvironmentKeywords'] = \
        rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
    # Fill NaN with empty {}.
    job_groups[
        'workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    job_groups['applicationModes'] = application_modes
    job_groups['applicationModes'] = job_groups.applicationModes.apply(
        lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    job_groups['growth20122022'] = _get_growth_2012_2022(
        fap_growth_2012_2022, rome_fap_crosswalk_txt)
    # An actual growth of 0 is moved to a tiny non-zero value, so that 0 is
    # only used below for job groups without any growth data.
    job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
    job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Add best departements.
    job_groups['departementScores'] = _get_less_stressful_departements_count(
        imt_market_score_csv)
    # Fill NaN with empty [].
    job_groups['departementScores'] = job_groups.departementScores.apply(
        lambda s: s if isinstance(s, list) else [])
    # Keep at most the 11 first scores of each job group.
    job_groups['bestDepartements'] = job_groups.departementScores.apply(
        lambda ds: ds[:11])

    # Add national market score.
    job_groups['nationalMarketScore'] = _get_national_market_scores(
        imt_market_score_csv)
    job_groups['nationalMarketScore'] = job_groups['nationalMarketScore'].fillna(0)

    # Add diploma requirements.
    job_groups[
        'is_diploma_strictly_required'] = _load_strict_diplomas_from_airtable(
            *strict_diplomas_airtable.split(':'))
    job_groups['is_diploma_strictly_required'] = job_groups[
        'is_diploma_strictly_required'].fillna(False)

    # Add job_boards.
    if jobboards_by_rome:
        job_groups['jobBoards'] = job_groups.index.map(jobboards_by_rome)

    # Add skills for the future.
    if skills_for_future_by_rome:
        job_groups['skillsForFuture'] = job_groups.index.map(
            skills_for_future_by_rome)

    # Add specific to job advice.
    if specific_to_job_by_rome:
        job_groups['specificAdvice'] = job_groups.index.map(
            specific_to_job_by_rome)

    # Add highest degree counts from user base.
    if users_highest_degrees is not None:
        job_groups['userDegrees'] = users_highest_degrees
        # Fill NaN with empty [].
        job_groups['userDegrees'] = job_groups.userDegrees.apply(
            lambda d: d if isinstance(d, list) else [])

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return typing.cast(List[Dict[str, Any]], job_groups.to_dict('records'))
コード例 #7
0
def make_dicts(
        *,
        rome_csv_pattern: str,
        application_mode_csv: Optional[str] = None,
        brookings_json: Optional[str] = None,
        domains_airtable: Optional[str] = None,
        fap_growth_2012_2022_csv: Optional[str] = None,
        handcrafted_assets_airtable: Optional[str] = None,
        imt_market_score_csv: Optional[str] = None,
        info_by_prefix_airtable: Optional[str] = None,
        jobboards_airtable: Optional[str] = None,
        job_application_complexity_json: Optional[str] = None,
        job_requirements_json: Optional[str] = None,
        rome_fap_crosswalk_txt: Optional[str] = None,
        rome_isco_crosswalk_xlsx: Optional[str] = None,
        skills_for_future_airtable: Optional[str] = None,
        soc_2010_xls: Optional[str] = None,
        soc_isco_crosswalk_xls: Optional[str] = None,
        specific_to_job_airtable: Optional[str] = None,
        strict_diplomas_airtable: Optional[str] = None,
        trainings_csv: Optional[str] = None,
        sampler_generator: Callable[[Optional[int]], Sampler] = _create_jobs_sampler) \
        -> list[dict[str, Any]]:
    """Import job info in MongoDB.

    Only the ROME data is required: each optional source that is not given
    simply leaves the fields it is used for out of the result.

    Args:
        rome_csv_pattern: pattern of paths to CSV file containing the ROME data.
            It must contain a '{}' that will be replaced by
            'referentiel_code_rome', 'referentiel_appellation',
            'referentiel_code_rome_riasec', 'texte', 'liens_rome_referentiels',
            'referentiel_env_travail' and 'item_arborescence'.
        job_requirements_json: path to a JSON file containing requirements per
            job group.
                For `requirements`.
        job_application_complexity_json: path to a JSON file containing the
            application complexity of each job group.
                For `application_complexity`.
        application_mode_csv: path to a CSV file containing the application mode
            data from emploi-store-dev API.
                For `application_modes`.
        rome_fap_crosswalk_txt: path to a TXT file containing the crosswalk
            from FAP codes to ROME job group codes.
                For `application_modes`, `growth_2012_2022`.
        handcrafted_assets_airtable: the base ID and the table named joined by
            a ':' of the AirTable containing the advice per job group (short
            texts describing assets required).
                For `requirements`.
        domains_airtable: the base ID and the table name joined by a ':' of the
            AirTable containing the domain name for each sector.
                For `work_environment_keywords`.
        strict_diplomas_airtable: the base ID and the table name joined by a ':' of the
            AirTable which tells if a diploma is strictly required.
                For `is_diploma_strictly_required`.
        info_by_prefix_airtable: the base ID and the table name joined by a ':'
            of the AirTable containing some manually specified info for group of
            job group (by ROME ID prefix).
                For `covidRisk`, `domain`, `hasFreelancers`, `inAWorkplace`, `inDomain`,
                `likeYourWorkplace`, `placePlural`, `preferredApplicationMedium`, `whatILoveAbout`,
                `toTheWorkplace`, `whySpecificCompany`, `atVariousCompanies`,
                `whatILoveAboutFeminine`.
        fap_growth_2012_2022_csv: path to a CSV file containing the growth of
            FAP job groups for the period 2012-2022.
                For `growth_2012_2022`.
        imt_market_score_csv: path to a CSV containing market score info from IMT.
                For `best_departements`, `departement_scores`, `national_market_score`.
        jobboards_airtable: the base ID and the table name joined by a ':' of the Airtable of the
            job boards.
                For `job_boards`.
        skills_for_future_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the skills for the future.
                For `skills_for_future`.
        specific_to_job_airtable: the base ID and the table name joined by a ':' of the Airtable
            of the specific to job pieces advice.
                For `specific_advice`
        brookings_json: path to a JSON file with data from Brookings report for automation risk.
                For `automation_risk`.
        soc_2010_xls: path to an XLS file with the names of US SOC 2010 groups.
                For `automation_risk`.
        soc_isco_crosswalk_xls: path to an XLS file of the crosswalk btw US SOC 2010 and ISCO-08.
                For `automation_risk`.
        rome_isco_crosswalk_xlsx: path to an XLSX file of the crosswalk btw ROME and ISCO-08.
                For `automation_risk`.
        trainings_csv: path to a CSV with trainings data.
                For `training_count`.
        sampler_generator: a function creating the sampler applied to the jobs
            of each job group. It is called with 3 to populate `samples` and
            with None to populate `jobs`.
    Returns:
        A list of dict that maps the JSON representation of JobGroup protos.
    """

    job_groups = cleaned_data.rome_job_groups(
        filename=rome_csv_pattern.format('referentiel_code_rome'))
    jobs = cleaned_data.rome_jobs(
        filename=rome_csv_pattern.format('referentiel_appellation'))
    holland_codes = cleaned_data.rome_holland_codes(
        filename=rome_csv_pattern.format('referentiel_code_rome_riasec'))
    rome_texts = cleaned_data.rome_texts(
        filename=rome_csv_pattern.format('texte'))
    rome_work_environments = cleaned_data.rome_work_environments(
        links_filename=rome_csv_pattern.format('liens_rome_referentiels'),
        ref_filename=rome_csv_pattern.format('referentiel_env_travail'))
    handcrafted_assets = _load_assets_from_airtable(*handcrafted_assets_airtable.split(':')) \
        if handcrafted_assets_airtable else {}
    sector_domains = _load_domains_from_airtable(*domains_airtable.split(':')) \
        if domains_airtable else {}
    info_by_prefix = _load_prefix_info_from_airtable(job_groups.index, info_by_prefix_airtable) \
        if info_by_prefix_airtable else None
    application_modes = _get_application_modes(
        application_mode_csv, rome_fap_crosswalk_txt,
    ) if application_mode_csv and rome_fap_crosswalk_txt else None
    fap_growth_2012_2022 = pandas.read_csv(fap_growth_2012_2022_csv) if fap_growth_2012_2022_csv \
        else None
    jobboards_by_rome = airtable_to_protos.load_items_from_prefix(
        'JobBoard', job_groups.index, jobboards_airtable, 'for-job-group',
    ) if jobboards_airtable else None
    skills_for_future_by_rome = airtable_to_protos.load_items_from_prefix(
        'Skill', job_groups.index, skills_for_future_airtable, 'rome_prefixes',
    ) if skills_for_future_airtable else None
    specific_to_job_by_rome = airtable_to_protos.load_items_from_prefix(
        'DynamicAdvice', job_groups.index, specific_to_job_airtable, 'fr:for-job-group',
    ) if specific_to_job_airtable else None
    users_highest_degrees = _load_highest_degrees_from_mongo()

    # Genderize names.
    masculine, feminine = rome_genderization.genderize(jobs.name)
    jobs['masculineName'] = masculine
    jobs['feminineName'] = feminine

    # List jobs and pick samples.
    jobs.index.name = 'codeOgr'
    jobs.reset_index(inplace=True)
    jobs_grouped = jobs.groupby('code_rome')
    job_groups['samples'] = jobs_grouped.apply(sampler_generator(3))
    # Replace NaN (job groups without any job) by empty lists.
    job_groups['samples'] = job_groups.samples.apply(
        lambda s: s if isinstance(s, list) else [])
    job_groups['jobs'] = jobs_grouped.apply(sampler_generator(None))
    job_groups['jobs'] = job_groups.jobs.apply(
        lambda s: s if isinstance(s, list) else [])

    # Add info by prefix.
    if info_by_prefix is not None:
        job_groups = job_groups.join(info_by_prefix)

    # Combine requirements from json file.
    if job_requirements_json:
        with open(job_requirements_json, encoding='utf-8') as job_requirements_file:
            job_requirements_list = json.load(job_requirements_file)
            job_requirements_dict = {
                job_requirement.pop('_id'): job_requirement
                for job_requirement in job_requirements_list}
        job_groups['requirements'] = job_groups.index.map(job_requirements_dict)
        # Replace NaN by empty dicts.
        job_groups['requirements'] = job_groups.requirements.apply(
            lambda r: r if isinstance(r, dict) else {})

        # Combine requirements from AirTable.
        # NOTE(review): the handcrafted assets are only merged when
        # job_requirements_json is given, as the requirements column does not
        # exist otherwise - confirm that dropping them in that case is intended.
        if handcrafted_assets:
            # The requirements dicts are updated in place, which is why
            # iterating on tuples is enough to modify the DataFrame content.
            for job_group in job_groups.itertuples():
                job_group.requirements.update(handcrafted_assets.get(job_group.Index, {}))

    # For all the columns below, missing values only appear once the data is
    # aligned on the job groups index, so they are filled after the assignment.
    # The filled column is assigned back explicitly: calling
    # fillna(inplace=True) on a column accessed from the frame acts on a
    # temporary object and does not update the frame under Copy-on-Write.
    if job_application_complexity_json:
        application_complexity = pandas.read_json(job_application_complexity_json)
        application_complexity.set_index('_id', inplace=True)
        job_groups['applicationComplexity'] = application_complexity['applicationComplexity']
        job_groups['applicationComplexity'] = job_groups['applicationComplexity'].fillna(
            'UNKNOWN_APPLICATION_COMPLEXITY')

    # Add Holland Code https://en.wikipedia.org/wiki/Holland_Codes.
    # Will later be used for job similarity measures.
    job_groups['hollandCodeMajor'] = holland_codes.major
    job_groups['hollandCodeMajor'] = job_groups['hollandCodeMajor'].fillna('')
    job_groups['hollandCodeMinor'] = holland_codes.minor
    job_groups['hollandCodeMinor'] = job_groups['hollandCodeMinor'].fillna('')

    # Add description, working environment and requirement as text.
    job_groups['description'] = rome_texts.definition
    job_groups['description'] = job_groups['description'].fillna('')
    job_groups['workingEnvironment'] = rome_texts.working_environment
    job_groups['workingEnvironment'] = job_groups['workingEnvironment'].fillna('')
    job_groups['requirementsText'] = rome_texts.requirements
    job_groups['requirementsText'] = job_groups['requirementsText'].fillna('')

    # Add work environment items.
    if sector_domains:
        rome_work_environments['domain'] = rome_work_environments['name'].map(sector_domains)
        job_groups['workEnvironmentKeywords'] = \
            rome_work_environments.groupby('code_rome').apply(_group_work_environment_items)
        # Fill NaN with empty {}.
        job_groups['workEnvironmentKeywords'] = job_groups.workEnvironmentKeywords.apply(
            lambda k: k if isinstance(k, dict) else {})

    # Add application modes.
    if application_modes is not None:
        job_groups['applicationModes'] = application_modes
        job_groups['applicationModes'] = job_groups.applicationModes.apply(
            lambda m: m if isinstance(m, dict) else {})

    # Add growth for the 2012-2022 period.
    if fap_growth_2012_2022 is not None and rome_fap_crosswalk_txt:
        job_groups['growth20122022'] = _get_growth_2012_2022(
            fap_growth_2012_2022, rome_fap_crosswalk_txt)
        # An actual growth of 0 is moved to a tiny non-zero value, so that 0 is
        # only used below for job groups without any growth data.
        job_groups.loc[job_groups.growth20122022 == 0, 'growth20122022'] = .000001
        job_groups['growth20122022'] = job_groups['growth20122022'].fillna(0)

    # Add automation risk.
    if brookings_json and soc_2010_xls and soc_isco_crosswalk_xls and rome_isco_crosswalk_xlsx:
        # The risk is stored as a rounded integer after multiplying by 100.
        job_groups['automationRisk'] = _get_automation_risk(
            brookings_json=brookings_json,
            soc_2010_xls=soc_2010_xls,
            soc_isco_crosswalk_xls=soc_isco_crosswalk_xls,
            rome_isco_crosswalk_xlsx=rome_isco_crosswalk_xlsx,
        ).mul(100).round(0).astype(int)
        # Mark 0 values as 1, as 0 means undefined.
        job_groups.loc[job_groups['automationRisk'] == 0, 'automationRisk'] = 1
        job_groups['automationRisk'] = job_groups['automationRisk'].fillna(0)

    # Add best departements.
    if imt_market_score_csv:
        market_scores = cleaned_data.market_scores(filename=imt_market_score_csv)
        # Keep only the rows at the 'D' area level, and rename the columns:
        # the yearly average of offers per 10 candidates is the value used as
        # market score by get_less_stressful_districts.
        market_scores = market_scores[market_scores.AREA_TYPE_CODE == 'D'].\
            reset_index().\
            drop([
                'market_score',
                'yearly_avg_offers_denominator',
                'AREA_TYPE_CODE',
            ], axis='columns').\
            rename({
                'departement_id': 'district_id',
                'rome_id': 'job_group',
                'yearly_avg_offers_per_10_candidates': 'market_score',
            }, axis='columns')
        job_groups['departementScores'] = market_score_derivatives.get_less_stressful_districts(
            market_scores)
        # Fill NaN with empty [].
        job_groups['departementScores'] = job_groups.departementScores.apply(
            lambda s: s if isinstance(s, list) else [])
        # TODO(cyrille): Drop this, once we're sure it's no more used in server.
        job_groups['bestDepartements'] = job_groups.departementScores.apply(lambda ds: ds[:11])

        # Add national market score.
        # TODO(cyrille): Add this in market_score_derivatives.
        job_groups['nationalMarketScore'] = _get_national_market_scores(imt_market_score_csv)
        job_groups['nationalMarketScore'] = job_groups['nationalMarketScore'].fillna(0)

    # Add diploma requirements.
    if strict_diplomas_airtable:
        job_groups['is_diploma_strictly_required'] = _load_strict_diplomas_from_airtable(
            *strict_diplomas_airtable.split(':'))
        job_groups['is_diploma_strictly_required'] = \
            job_groups['is_diploma_strictly_required'].fillna(False)

    # Add job_boards.
    if jobboards_by_rome:
        job_groups['jobBoards'] = job_groups.index.map(jobboards_by_rome)

    # Add skills for the future.
    if skills_for_future_by_rome:
        job_groups['skillsForFuture'] = job_groups.index.map(skills_for_future_by_rome)

    # Add specific to job advice.
    if specific_to_job_by_rome:
        job_groups['specificAdvice'] = job_groups.index.map(specific_to_job_by_rome)

    # Add highest degree counts from user base.
    if users_highest_degrees is not None:
        job_groups['userDegrees'] = users_highest_degrees
        # Fill NaN with empty [].
        job_groups['userDegrees'] = job_groups.userDegrees.apply(
            lambda d: d if isinstance(d, list) else [])

    # Add training data.
    if trainings_csv:
        trainings = pandas.read_csv(trainings_csv)
        job_groups['trainingCount'] = trainings.groupby('formation.proximiteRomes.code')\
            .apply(_count_trainings)
        # Fill NaN with empty {}.
        job_groups['trainingCount'] = job_groups.trainingCount.apply(
            lambda counts: counts if isinstance(counts, dict) else {})

    # Add no-requirement flag: job groups listed as without qualifications get
    # 'FALSE', all the other ones get 'TRUE'.
    job_groups['hasAnyRequirements'] = cleaned_data.jobs_without_qualifications(
        filename=rome_csv_pattern.format('item_arborescence'))\
        .no_requirements.map(lambda unused: 'FALSE')
    job_groups['hasAnyRequirements'] = job_groups['hasAnyRequirements'].fillna('TRUE')

    # Set index as field.
    job_groups.index.name = 'romeId'
    job_groups.reset_index(inplace=True)
    job_groups['_id'] = job_groups['romeId']

    return typing.cast(list[dict[str, Any]], job_groups.to_dict('records'))
コード例 #8
0
def csv2dicts(
        market_score_csv, offers_csv, referentiel_code_rome_csv, rome_item_arborescence,
        referentiel_apellation_rome_csv):
    """Build the reorient jobbing records per departement (meant for a MongoDB import).

    For each departement, select job groups that are listed as requiring no
    qualification, have a market score of at least 4, and whose most offered job
    has more than _MIN_JOB_OFFERS offers; then keep the ones with the most offers.

    Args:
        market_score_csv: path to a CSV file containing the market stress data. The
            columns AREA_CODE, AREA_TYPE_NAME, ROME_PROFESSION_CARD_CODE and
            TENSION_RATIO are read.
        offers_csv: path to a CSV file containing the job offer data. The columns
            departement_code, rome_profession_card_code, rome_profession_code and
            creation_date are read.
        referentiel_code_rome_csv: path to a CSV file containing job groups.
        rome_item_arborescence: path to a CSV file containing ROME item arborescence.
        referentiel_apellation_rome_csv: path to a CSV file containing the ROME job
            names, loaded through cleaned_data.rome_jobs.

    Returns:
        A list of dicts, one per departement: '_id' is the departement ID and
        'departementJobStats' is a dict whose 'jobs' key lists the selected jobs
        (name, masculineName, feminineName, rome_id, offers and market_score),
        sorted by decreasing number of offers.
    """

    # Job names, with their masculine and feminine variants.
    # NOTE(review): presumably indexed by OGR code as strings -- confirm in cleaned_data.
    jobs = cleaned_data.rome_jobs(filename=referentiel_apellation_rome_csv)
    jobs.index.name = 'codeOgr'
    jobs['masculineName'], jobs['feminineName'] = rome_genderization.genderize(jobs.name)

    offers = pd.read_csv(offers_csv, dtype={'departement_code': str}).rename(columns={
        'rome_profession_card_code': 'rome_id',
        'departement_code': 'departement_id',
    })

    # Attach the job names to each offer, using the job code rendered as an integer
    # string as lookup key.
    offers['codeOgr'] = offers.rome_profession_code.apply(lambda code: str(int(code)))
    for name_column in ('name', 'masculineName', 'feminineName'):
        offers[name_column] = offers.codeOgr.map(jobs[name_column])

    # Only keep the offers created strictly before January 1st, 2017.
    # NOTE(review): the original comment says this leaves exactly 2 years of data,
    # which depends on the content of the offers file.
    created_before_2017 = offers.creation_date < '2017-01-01'
    offers = offers[created_before_2017]

    # Number of offers per job and per departement, biggest counts first.
    offers_per_job_and_dep = (
        offers
        .groupby(['name', 'masculineName', 'feminineName', 'departement_id', 'rome_id'])
        .size()
        .to_frame('offers')
        .reset_index()
        .sort_values(by=['offers'], ascending=False))

    # Inside each job group only get the job with the most offers to give at least
    # one concrete example to the user.
    # TODO(marielaure): Check if we could benefit from proposing more than one job
    # name.
    best_job_in_group = (
        offers_per_job_and_dep
        .groupby(['rome_id', 'departement_id'])
        .first()
        .reset_index())

    # Only keep markets with a score (offers per 10 candidates) of at least 4, as
    # described here:
    # https://github.com/bayesimpact/bob-emploi-internal/blob/master/data_analysis/notebooks/research/reorientation/reorient_market_stress_skilless.ipynb
    market_score = pd.read_csv(market_score_csv, dtype={'AREA_CODE': str}).rename(columns={
        'AREA_CODE': 'departement_id',
        'ROME_PROFESSION_CARD_CODE': 'rome_id',
    })
    tense_market = market_score[market_score.TENSION_RATIO >= 4]

    # Job groups without qualification: they are linked to the arborescence items
    # located two levels below the '017' node. The strategy is described here:
    # https://github.com/bayesimpact/bob-emploi-internal/blob/master/data_analysis/notebooks/research/jobbing/seasonal_offers.ipynb
    job_groups = pd.read_csv(referentiel_code_rome_csv)
    arborescence = pd.read_csv(rome_item_arborescence)
    unqualified_root_code = '017'
    first_level = arborescence[arborescence.code_pere == unqualified_root_code]
    second_level = arborescence[arborescence.code_pere.isin(first_level.code_noeud)]
    unqualified_job_groups = second_level.merge(
        job_groups, left_on=['code_item_arbo_associe'], right_on=['code_ogr'],
    ).rename(columns={'code_rome': 'rome_id'})
    no_qualification_jobs = unqualified_job_groups[['rome_id', 'libelle_rome']]

    # Restrict the market data to those job groups, at the departement level.
    no_qualification_market = no_qualification_jobs.merge(tense_market, on='rome_id')
    departement_market = no_qualification_market[
        no_qualification_market.AREA_TYPE_NAME == 'Département']

    best_jobs = best_job_in_group.merge(departement_market, on=['rome_id', 'departement_id'])
    best_jobs = best_jobs[[
        'rome_id', 'departement_id', 'masculineName', 'feminineName', 'offers',
        'name', 'TENSION_RATIO']]

    # Drop the jobs that do not have strictly more than _MIN_JOB_OFFERS offers.
    has_enough_offers = best_jobs.offers > _MIN_JOB_OFFERS
    best_jobs = best_jobs[has_enough_offers].rename(columns={'TENSION_RATIO': 'market_score'})

    job_fields = ['name', 'masculineName', 'feminineName', 'rome_id', 'offers', 'market_score']

    def _first_job_as_dict(group):
        # A group may hold several rows: only its first one is exported.
        return group[job_fields].to_dict(orient='records')[0]

    jobs_by_departement = (
        best_jobs
        .groupby(['departement_id', 'rome_id', 'name', 'masculineName', 'feminineName', 'offers'])
        .apply(_first_job_as_dict)
        .to_frame('jobs')
        .reset_index()
        .rename(columns={'departement_id': '_id'}))

    def _top_jobs(group):
        # head() is called without argument, so pandas' default row count applies.
        by_decreasing_offers = group.sort_values('offers', ascending=False)
        return by_decreasing_offers[['jobs']].head().to_dict(orient='list')

    departement_stats = (
        jobs_by_departement
        .groupby('_id')
        .apply(_top_jobs)
        .to_frame('departementJobStats')
        .reset_index())

    return departement_stats.to_dict(orient='records')