def center_iterator(outpath_format="data/output/{}.json") -> Iterator[dict]:
    """Yield usable vaccination-center rows from the data.gouv.fr open-data CSV.

    Downloads the official CSV (``;``-delimited), normalizes each row's booking
    URL with ``fix_scrap_urls``, and yields the rows whose URL passes
    ``should_use_opendata_csv``. Rows marked closed (``centre_fermeture == "t"``)
    and rows with unusable URLs are recorded and dumped to a JSON report file.

    Args:
        outpath_format: ``str.format`` template used to build the report path.

    Yields:
        dict: one CSV row with a cleaned ``rdv_site_web`` field.

    Raises:
        requests.HTTPError: if the CSV download fails.
    """
    url = "https://www.data.gouv.fr/fr/datasets/r/5cb21a85-b0b0-4a65-a249-806a040ec372"
    response = requests.get(url)
    response.raise_for_status()
    reader = io.StringIO(response.content.decode("utf8"))
    csvreader = csv.DictReader(reader, delimiter=";")
    total = 0
    centres_non_pris_en_compte = {"centres_fermes": {}, "centres_urls_vides": []}
    for row in csvreader:
        row["rdv_site_web"] = fix_scrap_urls(row["rdv_site_web"])
        # Closed centers are recorded for the report but NOT skipped here;
        # they are still yielded if their URL is usable (original behavior).
        if row["centre_fermeture"] == "t":
            centres_non_pris_en_compte["centres_fermes"][row["gid"]] = row["rdv_site_web"]
        if should_use_opendata_csv(row["rdv_site_web"]):
            yield row
        else:
            centres_non_pris_en_compte["centres_urls_vides"].append(row["gid"])
        total += 1
    # Fix: nb_urls_vides was computed twice in the original; compute each count once.
    nb_fermes = len(centres_non_pris_en_compte["centres_fermes"])
    nb_urls_vides = len(centres_non_pris_en_compte["centres_urls_vides"])
    logger.info(f"Il y a {nb_fermes} centres fermes dans le fichier gouv sur un total de {total}")
    logger.info(f"Il y a {nb_urls_vides} centres avec une URL vide dans le fichier gouv sur un total de {total}")
    outpath = outpath_format.format("centres_non_pris_en_compte_gouv")
    with open(outpath, "w") as fichier:
        json.dump(centres_non_pris_en_compte, fichier, indent=2)
def fetch_centre_slots(rdv_site_web, start_date, fetch_map: dict = None):
    """Dispatch a booking URL to the matching platform scraper.

    Args:
        rdv_site_web: booking-page URL of the vaccination center.
        start_date: first date from which to look for slots.
        fetch_map: optional mapping ``platform -> {'urls': [...], 'scraper_ptr': fn}``.
            May be overridden for unit testing purposes.

    Returns:
        ScraperResult: platform ``'Autre'`` with no availability when no
        platform URL prefix matches, otherwise the dispatched scraper's result.
    """
    if fetch_map is None:
        # Map platform to implementation.
        # May be overridden for unit testing purposes.
        fetch_map = {
            'Doctolib': {
                'urls': ['https://partners.doctolib.fr', 'https://www.doctolib.fr'],
                'scraper_ptr': doctolib_fetch_slots
            },
            'Keldoc': {
                'urls': ['https://vaccination-covid.keldoc.com', 'https://keldoc.com'],
                'scraper_ptr': keldoc_fetch_slots
            },
            'Maiia': {
                'urls': ['https://www.maiia.com'],
                'scraper_ptr': maiia_fetch_slots
            },
            'Mapharma': {
                'urls': [
                    'https://mapharma.net/',
                ],
                'scraper_ptr': mapharma_fetch_slots
            },
            'Ordoclic': {
                'urls': [
                    'https://app.ordoclic.fr/',
                ],
                'scraper_ptr': ordoclic_fetch_slots
            }
        }
    rdv_site_web = fix_scrap_urls(rdv_site_web)
    request = ScraperRequest(rdv_site_web, start_date)
    # Determine platform based on visit URL.
    # Idiom fix: str.startswith accepts a tuple of prefixes — one call replaces
    # the original sum-of-booleans counter. No break, so the LAST matching
    # platform wins, exactly as in the original loop.
    platform = None
    for scraper_name, scraper in fetch_map.items():
        if rdv_site_web.startswith(tuple(scraper.get('urls', []))):
            platform = scraper_name
    if not platform:
        return ScraperResult(request, 'Autre', None)
    # Dispatch to appropriate implementation.
    fetch_impl = fetch_map[platform]['scraper_ptr']
    result = ScraperResult(request, platform, None)
    result.next_availability = fetch_impl(request)
    return result
def scrape_debug(urls):
    """Scrape each URL once and log the detected platform and next availability.

    Manual-debugging helper: failures for one URL are logged (with traceback)
    and that URL is skipped, so remaining URLs are still processed.

    Args:
        urls: iterable of booking-page URLs to scrape.
    """
    enable_logger_for_debug()
    start_date = get_start_date()
    for rdv_site_web in urls:
        rdv_site_web = fix_scrap_urls(rdv_site_web)
        logger.info('scraping URL %s', rdv_site_web)
        try:
            result = fetch_centre_slots(rdv_site_web, start_date)
        except Exception:
            # Bug fix: `continue` was missing — on failure the logging below
            # would hit an undefined `result` (NameError on the first URL) or
            # log a stale result from a previous iteration.
            logger.exception("erreur lors du traitement")
            continue
        logger.info(
            f'{result.platform!s:16} {result.next_availability or ""!s:32}')
def scrape_debug(urls):  # pragma: no cover
    """Scrape each URL once, logging platform, next availability and requests.

    Manual-debugging helper: failures for one URL are logged (with traceback)
    and that URL is skipped, so remaining URLs are still processed.

    Args:
        urls: iterable of booking-page URLs to scrape.
    """
    enable_logger_for_debug()
    start_date = get_start_date()
    for rdv_site_web in urls:
        rdv_site_web = fix_scrap_urls(rdv_site_web)
        logger.info("scraping URL %s", rdv_site_web)
        try:
            result = fetch_centre_slots(rdv_site_web, start_date)
        except Exception:
            # Bug fix: `continue` was missing — on failure the code below
            # would reference an undefined `result` (NameError on the first
            # URL) or a stale result from a previous iteration.
            logger.exception("erreur lors du traitement")
            continue
        logger.info(
            f'{result.platform!s:16} {result.next_availability or ""!s:32}')
        if result.request.appointment_count:
            logger.debug(f"appointments: {result.request.appointment_count}")
        log_requests(result.request)
def fetch_centre_slots(rdv_site_web, start_date, fetch_map: dict = None):
    """Resolve the scraping platform for a booking URL and fetch its slots.

    Args:
        rdv_site_web: booking-page URL of the vaccination center.
        start_date: first date from which to look for slots.
        fetch_map: optional platform dispatch table; defaults to
            ``get_default_fetch_map()`` (override hook for unit tests).

    Returns:
        ScraperResult: ``'Autre'`` with no availability when the URL matches
        no known platform, otherwise the matching scraper's result.
    """
    if fetch_map is None:
        # Fall back to the standard platform->scraper dispatch table.
        fetch_map = get_default_fetch_map()

    cleaned_url = fix_scrap_urls(rdv_site_web)
    request = ScraperRequest(cleaned_url, start_date)

    platform = get_center_platform(cleaned_url, fetch_map=fetch_map)
    if not platform:
        # Unknown booking site: report it as "Autre" with no availability.
        return ScraperResult(request, "Autre", None)

    # Hand the request off to the platform-specific implementation.
    scrape = fetch_map[platform]["scraper_ptr"]
    outcome = ScraperResult(request, platform, None)
    outcome.next_availability = scrape(request)
    return outcome
def fetch_centre_slots(
    rdv_site_web,
    center_platform,
    start_date,
    creneau_q,
    center_info,
    fetch_map: dict = None,
    input_data: dict = None,
    atlas_gid=None,
) -> ScraperResult:
    """Build a scraper request for a center and dispatch it to its platform.

    Args:
        rdv_site_web: booking-page URL of the vaccination center.
        center_platform: platform hint forwarded to ``get_center_platform``.
        start_date: first date from which to look for slots.
        creneau_q: queue passed through to the platform scraper.
        center_info: center metadata; its ``type`` and ``internal_id`` (when
            set) are copied onto the request.
        fetch_map: optional platform dispatch table; defaults to
            ``get_default_fetch_map()`` (override hook for unit tests).
        input_data: optional extra payload attached to the request.
        atlas_gid: optional atlas identifier forwarded to the request.

    Returns:
        ScraperResult: ``'Autre'`` with no availability when no platform
        matches, otherwise the matching scraper's result.
    """
    if fetch_map is None:
        # Fall back to the standard platform->scraper dispatch table.
        fetch_map = get_default_fetch_map()

    # Pull optional metadata off the center; falsy values stay None.
    practitioner_type = center_info.type or None
    internal_id = center_info.internal_id or None

    cleaned_url = fix_scrap_urls(rdv_site_web)
    request = ScraperRequest(
        cleaned_url,
        start_date,
        center_info,
        internal_id=internal_id,
        practitioner_type=practitioner_type,
        atlas_gid=atlas_gid,
    )

    platform = get_center_platform(cleaned_url, center_platform, fetch_map=fetch_map)
    if not platform:
        # Unknown booking site: report it as "Autre" with no availability.
        return ScraperResult(request, "Autre", None)

    if input_data:
        request.input_data = input_data

    # Hand the request off to the platform-specific implementation.
    scrape = fetch_map[platform]["scraper_ptr"]
    outcome = ScraperResult(request, platform, None)
    outcome.next_availability = scrape(request, creneau_q)
    return outcome