Example #1
0
def _trim_job_offers_csv(args: argparse.Namespace,
                         out: Optional[TextIO]) -> None:
    """Trim a job offers CSV, keeping only selected fields and recent offers.

    Args:
        args: parsed command-line arguments. Uses in_csv (input CSV path),
            colnames_txt (column names file), out_csv (output CSV path),
            fields (comma-separated list of fields to keep),
            min_creation_date (offers created before this are dropped) and
            trim_dates (whether to truncate date fields to 10 chars).
        out: optional stream used by the tqdm progress bar.
    """

    fieldnames = args.fields.split(',')
    all_job_offers = job_offers.iterate(
        args.in_csv,
        args.colnames_txt,
        # creation_date is always required: it drives the date filter below.
        required_fields=set(fieldnames + ['creation_date']))

    # Rough total number of offers, used only to size the progress bar.
    number_offers_estimate = 8500000

    # newline='' is required by the csv module: without it, DictWriter
    # produces extra blank rows on platforms using \r\n line endings.
    with open(args.out_csv, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()

        trim_date_fields: Set[str] = set()
        if args.trim_dates:
            trim_date_fields = {
                field
                for field in fieldnames
                if field.startswith('date_') or field.endswith('_date')
            }

        for job_offer in tqdm.tqdm(all_job_offers,
                                   total=number_offers_estimate,
                                   file=out):
            if job_offer.creation_date < args.min_creation_date:
                continue
            row = {field: getattr(job_offer, field) for field in fieldnames}
            for field in trim_date_fields:
                # Keep only the date part (first 10 chars, YYYY-MM-DD) of
                # what is presumably a full timestamp string.
                row[field] = row[field][:10]
            writer.writerow(row)
Example #2
0
def trim_job_offers_csv(in_csv,
                        colnames_txt,
                        out_csv,
                        min_creation_date='',
                        fields=_DEFAULT_FIELDS,
                        out=None):
    """Trim job offers CSV.

    Args:
        in_csv: the path of the CSV file containing all job offers in the Pôle
            Emploi format (using latin-1 encoding, | separators, etc).
        colnames_txt: the TXT file containing the list of column names.
        out_csv: the path where to store the output CSV file.
        min_creation_date: offers created strictly before this date string are
            dropped; the empty default keeps everything.
        fields: the list of fields to keep, separated by commas.
        out: optional stream used by the tqdm progress bar.
    """

    fieldnames = fields.split(',')
    all_job_offers = job_offers.iterate(
        in_csv,
        colnames_txt,
        # creation_date is always required: it drives the date filter below.
        required_fields=set(fieldnames + ['creation_date']))

    # Rough total number of offers, used only to size the progress bar.
    number_offers_estimate = 8500000

    # newline='' is required by the csv module: without it, DictWriter
    # produces extra blank rows on platforms using \r\n line endings.
    with open(out_csv, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()
        for job_offer in tqdm.tqdm(all_job_offers,
                                   total=number_offers_estimate,
                                   file=out):
            if job_offer.creation_date < min_creation_date:
                continue
            writer.writerow(
                {field: getattr(job_offer, field)
                 for field in fieldnames})
Example #3
0
    def test_missing_required_fields(self):
        """Test missing required field."""

        offers = job_offers.iterate(path.join(self.testdata_folder,
                                              'job_offers.csv'),
                                    path.join(self.testdata_folder,
                                              'column_names.txt'),
                                    # Set literal instead of set([...]).
                                    required_fields={'foobar'})
        # iterate is lazy, so the error only surfaces on first consumption.
        self.assertRaises(ValueError, next, offers)
Example #4
0
    def test_basic(self):
        """Test basic usage."""

        csv_path = path.join(self.testdata_folder, 'job_offers.csv')
        colnames_path = path.join(self.testdata_folder, 'column_names.txt')
        offers = list(job_offers.iterate(csv_path, colnames_path))
        # Golden values.
        self.assertEqual(9, len(offers))
        self.assertEqual('000053Q', offers[1].id_offre)
        self.assertEqual('Contrat travail', offers[2].contract_nature_name)
Example #5
0
def extract_offers_per_cities(
        offers_file: str, colnames: str, min_creation_date: str, data_folder: str = 'data') \
        -> list[dict[str, Any]]:
    """Extract the interesting cities in terms of number of offers for each job group.

    Args:
        offers_file: path of cvs file with offers.
        colnames: the names of the columns in the offer file.
        min_creation_date: the date from which we consider the offers.
    """

    required_fields = {
        _CITY_CODE_FIELD, _JOB_GROUP_CODE_FIELD, _LATITUDE_CODE_FIELD,
        _LONGITUDE_CODE_FIELD, _CITY_NAME_CODE_FIELD
    }
    offers_rows = job_offers.iterate(offers_file, colnames, required_fields)
    city_data = _list_hiring_cities(
        offers_rows, min_creation_date, data_folder)

    # Per-job-group cutoff, sub-linear in the group's total offer count so
    # large groups do not keep an excessive number of cities.
    job_group_threshold: dict[str, float] = collections.defaultdict(float)
    for group, total_offers in city_data.offers_per_job_group.items():
        job_group_threshold[group] = math.pow(total_offers, 0.6) / 40

    job_group_to_kept_cities: dict[str, list[dict[str, Any]]] = \
        collections.defaultdict(list)
    for group, city_counts in city_data.job_group_to_city_ids.items():
        kept_cities = []
        for city_id, offer_count in city_counts.items():
            if offer_count <= job_group_threshold[group]:
                continue
            city_info = city_data.city_info[city_id]
            population = city_info.get('population')
            if not population:
                # No (or zero) population: the per-inhabitant ratio is
                # undefined, so the city is skipped.
                continue
            kept_cities.append({
                'city': city_info,
                'offers': offer_count,
                'offersPerInhabitant': offer_count / population,
            })

        # Most attractive cities (offers per inhabitant) come first.
        job_group_to_kept_cities[group] = sorted(
            kept_cities,
            key=lambda city: typing.cast(float, city['offersPerInhabitant']),
            reverse=True)

    return [
        {'_id': group_id, 'hiringCities': weighted_cities}
        for group_id, weighted_cities in job_group_to_kept_cities.items()
    ]
Example #6
0
def csv2dicts(job_offers_csv, colnames_txt, last_year='2015'):
    """Import the changes of # of job offers per job group and dept in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
        last_year: The year to consider to compute the metrics.
    Returns:
        Evolution data as a LocalJobStats JSON-proto compatible dict.
    """

    counter = _EvolutionCounter(int(last_year))
    all_offers = job_offers.iterate(
        job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in all_offers:
        counter.collect(offer)
    return list(counter.get_proto_dicts())
def csv2dicts(job_offers_csv, colnames_txt):
    """Import the requirement from job offers grouped by Job Group in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
    Returns:
        Requirements as a JobRequirements JSON-proto compatible dict.
    """

    collectors = collections.defaultdict(_RequirementsCollector)
    all_offers = job_offers.iterate(
        job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in all_offers:
        collectors[offer.rome_profession_card_code].collect(offer)

    # Deterministic output order: sorted by job group ID.
    result = []
    for group_id in sorted(collectors):
        result.append(
            dict(collectors[group_id].get_proto_dict(), _id=group_id))
    return result
def csv2dicts(job_offers_csv: str, colnames_txt: str) -> list[dict[str, Any]]:
    """Import the requirement from job offers grouped by Job Group in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
    Returns:
        Requirements as a JobRequirements JSON-proto compatible dict.
    """

    collectors: dict[str, _RequirementsCollector] = collections.defaultdict(
        _RequirementsCollector)
    all_offers = job_offers.iterate(
        job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in tqdm.tqdm(all_offers, total=_TOTAL_RECORDS):
        group_code = offer.rome_profession_card_code
        # Offers without a job group code are skipped.
        if group_code:
            collectors[group_code].collect(offer)

    # Deterministic output order: sorted by job group ID.
    return [
        dict(collectors[group_id].get_proto_dict(), _id=group_id)
        for group_id in sorted(collectors)
    ]