def _trim_job_offers_csv(args: argparse.Namespace, out: Optional[TextIO]) -> None:
    """Trim a job offers CSV according to parsed command-line arguments.

    Keeps only the columns listed in args.fields, drops offers created
    before args.min_creation_date and, when args.trim_dates is set,
    truncates date-like fields to their first 10 characters (YYYY-MM-DD).

    Args:
        args: parsed command-line arguments; reads in_csv, colnames_txt,
            out_csv, fields, min_creation_date and trim_dates.
        out: a file-like object for tqdm progress output, or None.
    """
    fieldnames = args.fields.split(',')
    all_job_offers = job_offers.iterate(
        args.in_csv, args.colnames_txt,
        required_fields=set(fieldnames + ['creation_date']))
    number_offers_estimate = 8500000
    # newline='' is required by the csv module so that csv.writer controls
    # line endings itself (otherwise extra blank lines appear on platforms
    # that translate '\n').
    with open(args.out_csv, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()
        trim_date_fields: Set[str] = set()
        if args.trim_dates:
            trim_date_fields = {
                field for field in fieldnames
                if field.startswith('date_') or field.endswith('_date')
            }
        for job_offer in tqdm.tqdm(
                all_job_offers, total=number_offers_estimate, file=out):
            if job_offer.creation_date < args.min_creation_date:
                continue
            row = {field: getattr(job_offer, field) for field in fieldnames}
            for field in trim_date_fields:
                # Keep only the date part of a "YYYY-MM-DD hh:mm:ss" stamp.
                row[field] = row[field][:10]
            writer.writerow(row)
def trim_job_offers_csv(in_csv, colnames_txt, out_csv, min_creation_date='', fields=_DEFAULT_FIELDS, out=None):
    """Trim job offers CSV.

    Args:
        in_csv: the path of the CSV file containing all job offers in the
            Pôle Emploi format (using latin-1 encoding, | separators, etc).
        colnames_txt: the TXT file containing the list of column names.
        out_csv: the path where to store the output CSV file.
        min_creation_date: drop offers whose creation_date compares strictly
            less than this string (empty keeps everything).
        fields: the list of fields to keep, separated by commas.
        out: a file-like object for tqdm progress output, or None.
    """
    fieldnames = fields.split(',')
    all_job_offers = job_offers.iterate(
        in_csv, colnames_txt,
        required_fields=set(fieldnames + ['creation_date']))
    number_offers_estimate = 8500000
    # newline='' is required by the csv module so that csv.writer controls
    # line endings itself (otherwise extra blank lines appear on platforms
    # that translate '\n').
    with open(out_csv, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()
        for job_offer in tqdm.tqdm(
                all_job_offers, total=number_offers_estimate, file=out):
            if job_offer.creation_date < min_creation_date:
                continue
            writer.writerow(
                {field: getattr(job_offer, field) for field in fieldnames})
def test_missing_required_fields(self):
    """Test missing required field."""
    csv_path = path.join(self.testdata_folder, 'job_offers.csv')
    colnames_path = path.join(self.testdata_folder, 'column_names.txt')
    offers_iterator = job_offers.iterate(
        csv_path, colnames_path, required_fields={'foobar'})
    self.assertRaises(ValueError, next, offers_iterator)
def test_basic(self):
    """Test basic usage."""
    csv_path = path.join(self.testdata_folder, 'job_offers.csv')
    colnames_path = path.join(self.testdata_folder, 'column_names.txt')
    offers = list(job_offers.iterate(csv_path, colnames_path))
    # Golden values.
    self.assertEqual(9, len(offers))
    self.assertEqual('000053Q', offers[1].id_offre)
    self.assertEqual('Contrat travail', offers[2].contract_nature_name)
def extract_offers_per_cities(
        offers_file: str, colnames: str, min_creation_date: str,
        data_folder: str = 'data') \
        -> list[dict[str, Any]]:
    """Extract the interesting cities in terms of number of offers for each job group.

    Args:
        offers_file: path of csv file with offers.
        colnames: the names of the columns in the offer file.
        min_creation_date: the date from which we consider the offers.
        data_folder: the folder holding reference data used by
            _list_hiring_cities.

    Returns:
        A list of JSON-proto compatible dicts, one per job group, with keys
        '_id' (the job group ID) and 'hiringCities' (kept cities sorted by
        offers per inhabitant, descending).
    """
    required_fields = {
        _CITY_CODE_FIELD, _JOB_GROUP_CODE_FIELD, _LATITUDE_CODE_FIELD,
        _LONGITUDE_CODE_FIELD, _CITY_NAME_CODE_FIELD}
    offers_rows = job_offers.iterate(offers_file, colnames, required_fields)
    city_data = _list_hiring_cities(offers_rows, min_creation_date, data_folder)

    # Computing the threshold per job group: sub-linear in the job group's
    # total offer count, so that very large job groups do not require an
    # unreasonable number of offers per city.
    job_group_threshold: dict[str, float] = collections.defaultdict(float)
    for job_group, offers in city_data.offers_per_job_group.items():
        job_group_threshold[job_group] = math.pow(offers, 0.6) / 40

    job_group_to_kept_cities: dict[str, list[dict[str, Any]]] = \
        collections.defaultdict(list)
    for job_group, city_ids in city_data.job_group_to_city_ids.items():
        kept_cities = []
        for city_id, offer_count in city_ids.items():
            if offer_count <= job_group_threshold[job_group]:
                continue
            city_info = city_data.city_info[city_id]
            # Cities with a missing or zero population cannot be ranked by
            # offers per inhabitant, so they are dropped.
            population = city_info.get('population')
            if not population:
                continue
            kept_cities.append({
                'city': city_info,
                'offers': offer_count,
                'offersPerInhabitant': offer_count / population,
            })
        job_group_to_kept_cities[job_group] = sorted(
            kept_cities,
            key=lambda k: typing.cast(float, k['offersPerInhabitant']),
            reverse=True)

    return [
        {'_id': job_group_id, 'hiringCities': job_group_weighted_cities}
        for job_group_id, job_group_weighted_cities
        in job_group_to_kept_cities.items()]
def csv2dicts(job_offers_csv, colnames_txt, last_year='2015'):
    """Import the changes of # of job offers per job group and dept in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
        last_year: The year to consider to compute the metrics.
    Returns:
        Evolution data as a LocalJobStats JSON-proto compatible dict.
    """
    counter = _EvolutionCounter(int(last_year))
    offers = job_offers.iterate(job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in offers:
        counter.collect(offer)
    return list(counter.get_proto_dicts())
def csv2dicts(job_offers_csv, colnames_txt):
    """Import the requirement from job offers grouped by Job Group in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
    Returns:
        Requirements as a JobRequirements JSON-proto compatible dict.
    """
    collectors = collections.defaultdict(_RequirementsCollector)
    offers = job_offers.iterate(job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in offers:
        collectors[offer.rome_profession_card_code].collect(offer)
    return [
        dict(collectors[group_id].get_proto_dict(), _id=group_id)
        for group_id in sorted(collectors)
    ]
def csv2dicts(job_offers_csv: str, colnames_txt: str) -> list[dict[str, Any]]:
    """Import the requirement from job offers grouped by Job Group in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
    Returns:
        Requirements as a JobRequirements JSON-proto compatible dict.
    """
    collectors: dict[str, _RequirementsCollector] = collections.defaultdict(
        _RequirementsCollector)
    offers = job_offers.iterate(job_offers_csv, colnames_txt, _REQUIRED_FIELDS)
    for offer in tqdm.tqdm(offers, total=_TOTAL_RECORDS):
        # Offers with an empty job group code are skipped.
        if not offer.rome_profession_card_code:
            continue
        collectors[offer.rome_profession_card_code].collect(offer)
    return [
        dict(collectors[group_id].get_proto_dict(), _id=group_id)
        for group_id in sorted(collectors)
    ]