def centre_iterator(overwrite_centers_file=True):
    if not MAIIA_ENABLED:
        return None
    try:
        center_path = MAIIA_SCRAPER.get("result_path")
        data_auto = get_config().get("data-auto", {}).get("base_url")
        url = f"{data_auto}{center_path}"
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        if overwrite_centers_file:
            with open(center_path, "w") as f:
                f.write(json.dumps(data, indent=2))
        logger.info(f"Found {len(data)} Maiia centers (external scraper).")
        for center in data:
            yield center
    except Exception as e:
        logger.warning(f"Unable to scrape Maiia centers: {e}")
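# Usage sketch (my addition, not project code): because the body contains a
# `yield`, calling centre_iterator() always returns a generator; the early
# `return` merely leaves that generator empty when MAIIA_ENABLED is False, so
# callers can iterate unconditionally. The "url" key is an assumption about
# the payload shape.
for center in centre_iterator(overwrite_centers_file=False):
    logger.info(f"Maiia center: {center.get('url')}")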
from enum import Enum
from typing import Optional

from utils.vmd_config import get_config

VACCINE_CONF = get_config().get("vaccines", {})


class Vaccine(str, Enum):
    PFIZER = "Pfizer-BioNTech"
    MODERNA = "Moderna"
    ASTRAZENECA = "AstraZeneca"
    JANSSEN = "Janssen"
    ARNM = "ARNm"


VACCINES_NAMES = {
    Vaccine.PFIZER: VACCINE_CONF.get(Vaccine.PFIZER, []),
    Vaccine.MODERNA: VACCINE_CONF.get(Vaccine.MODERNA, []),
    Vaccine.ARNM: VACCINE_CONF.get(Vaccine.ARNM, []),
    Vaccine.ASTRAZENECA: VACCINE_CONF.get(Vaccine.ASTRAZENECA, []),
    Vaccine.JANSSEN: VACCINE_CONF.get(Vaccine.JANSSEN, []),
}


def get_vaccine_name(name: Optional[str], fallback: Optional[Vaccine] = None) -> Optional[Vaccine]:
    if not name:
        return fallback
    name = name.lower().strip()
    for vaccine, vaccine_names in VACCINES_NAMES.items():
        for vaccine_name in vaccine_names:
            # The excerpt was truncated here; minimal completion from context:
            # each configured alias is treated as a substring of the raw label.
            if vaccine_name in name:
                return vaccine
    return fallback
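# Config-independent checks of the fallback behaviour (my addition), plus a
# hedged illustration of alias matching using hypothetical aliases that may
# not match the project's actual "vaccines" config:
assert get_vaccine_name(None, fallback=Vaccine.ARNM) is Vaccine.ARNM
assert get_vaccine_name("") is None
# With VACCINE_CONF like {"Pfizer-BioNTech": ["pfizer", "comirnaty"], ...}:
# get_vaccine_name("Vaccin Comirnaty (adulte)") -> Vaccine.PFIZER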
    is_category_relevant,
)
from scraper.pattern.vaccine import get_vaccine_name, get_doctolib_vaccine_name, Vaccine
from scraper.pattern.scraper_request import ScraperRequest
from scraper.error import Blocked403, DoublonDoctolib, RequestError
from utils.vmd_config import get_conf_outputs, get_conf_platform, get_config
from utils.vmd_utils import DummyQueue
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

# PLATFORM MUST BE LOWERCASE; PLEASE KEEP THE "lower()" CALL IN CASE OF BAD INPUT FORMAT.
PLATFORM = "doctolib".lower()

PLATFORM_CONF = get_conf_platform(PLATFORM)
PLATFORM_ENABLED = PLATFORM_CONF.get("enabled", True)
SCRAPE_ONLY_ATLAS = get_config().get("scrape_only_atlas_centers", False)

NUMBER_OF_SCRAPED_DAYS = get_config().get("scrape_on_n_days", 28)
PLATFORM_DAYS_PER_PAGE = PLATFORM_CONF.get("days_per_page", 7)

# Number of result pages needed to cover the scraping window (ceiling division).
if NUMBER_OF_SCRAPED_DAYS % PLATFORM_DAYS_PER_PAGE == 0:
    PLATFORM_PAGES_NUMBER = int(NUMBER_OF_SCRAPED_DAYS / PLATFORM_DAYS_PER_PAGE)
else:
    PLATFORM_PAGES_NUMBER = (NUMBER_OF_SCRAPED_DAYS // PLATFORM_DAYS_PER_PAGE) + 1

PLATFORM_TIMEOUT = PLATFORM_CONF.get("timeout", 10)
PLATFORM_REQUEST_SLEEP = PLATFORM_CONF.get("request_sleep", 0.1)

timeout = httpx.Timeout(PLATFORM_TIMEOUT, connect=PLATFORM_TIMEOUT)

DOCTOLIB_HEADERS = {
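# Equivalence check (my addition, not project code): the two-branch page count
# above is exactly a ceiling division over the scraping window, e.g.
# 28 days / 7 days-per-page -> 4 pages, 30 / 7 -> 5 pages.
import math

assert PLATFORM_PAGES_NUMBER == math.ceil(NUMBER_OF_SCRAPED_DAYS / PLATFORM_DAYS_PER_PAGE)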
import json  # needed by the JSON loaders below
import logging
from typing import List
from urllib.parse import urlparse, urlencode, urlunparse, parse_qs, unquote
import datetime as dt
import pytz
import requests
import time
from datetime import date, timedelta, datetime
from pathlib import Path
from unidecode import unidecode

from utils.vmd_config import get_conf_inputs, get_config

RESERVED_CENTERS = get_config().get("reserved_centers", [])


def load_insee() -> dict:
    with open(get_conf_inputs().get("postalcode_to_insee")) as json_file:
        return json.load(json_file)


def load_cedex_to_insee() -> dict:
    with open(get_conf_inputs().get("cedex_to_insee")) as json_file:
        return json.load(json_file)


logger = logging.getLogger("scraper")
insee = load_insee()
cedex_to_insee = load_cedex_to_insee()
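# Hypothetical convenience helper (my addition; the key/value layouts of the
# two mappings are assumptions, not verified against the real input files):
# try the postal-code table first, then fall back to the CEDEX table.
def to_insee(code: str):
    return insee.get(code) or cedex_to_insee.get(code)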
import json
import logging
from datetime import datetime

import pytz
import requests
from pathlib import Path

from stats_generation.stats_center_types import generate_stats_center_types
from stats_generation.stats_map import make_maps
from utils.vmd_config import get_conf_outstats, get_conf_outputs, get_config, get_conf_inputs
from utils.vmd_logger import enable_logger_for_production

logger = logging.getLogger("scraper")

DATA_AUTO = get_config().get("base_urls").get("gitlab_public_path")


def generate_stats_date(centres_stats):
    stats_path = get_conf_inputs().get("from_gitlab_public").get("by_date")
    stats_data = {
        "dates": [],
        "total_centres_disponibles": [],
        "total_centres": [],
        "total_appointments": [],
    }
    try:
        history_rq = requests.get(f"{DATA_AUTO}{stats_path}")
        data = history_rq.json()
        if data:
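# Hedged sketch of what plausibly follows the truncated `if data:` above:
# carry the fetched history into stats_data so today's totals can be appended.
# Field names mirror the keys initialised in stats_data; the real merge logic
# may differ.
def merge_history(stats_data: dict, data: dict) -> None:
    for key in stats_data:
        stats_data[key] = data.get(key, stats_data[key])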
from scraper.pattern.vaccine import Vaccine, get_vaccine_name
from utils.vmd_config import get_conf_platform, get_config
from utils.vmd_utils import departementUtils, DummyQueue

AVECMONDOC_CONF = get_conf_platform("avecmondoc")
AVECMONDOC_ENABLED = AVECMONDOC_CONF.get("enabled", False)
AVECMONDOC_API = AVECMONDOC_CONF.get("api", {})
AVECMONDOC_SCRAPER = AVECMONDOC_CONF.get("center_scraper", {})
AVECMONDOC_FILTERS = AVECMONDOC_CONF.get("filters", {})
AVECMONDOC_VALID_REASONS = AVECMONDOC_FILTERS.get("valid_reasons", [])
AVECMONDOC_HEADERS = {
    "User-Agent": os.environ.get("AVECMONDOC_API_KEY", ""),
}
NUMBER_OF_SCRAPED_DAYS = get_config().get("scrape_on_n_days", 28)
AVECMONDOC_DAYS_PER_PAGE = AVECMONDOC_CONF.get("days_per_page", 7)

timeout = httpx.Timeout(AVECMONDOC_CONF.get("timeout", 25), connect=AVECMONDOC_CONF.get("timeout", 25))
DEFAULT_CLIENT = httpx.Client(headers=AVECMONDOC_HEADERS, timeout=timeout)

logger = logging.getLogger("scraper")
paris_tz = timezone("Europe/Paris")


def search(client: httpx.Client = DEFAULT_CLIENT) -> Optional[list]:
    url = AVECMONDOC_API.get("search", "")
    limit = AVECMONDOC_API.get("search_page_size", 10)
    page = 1
    result = {"data": [], "hasNextPage": True}
    while result["hasNextPage"]:
        payload = {"limit": limit, "page": page}
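# Hedged sketch of one pagination round for the loop above, assuming the API
# echoes {"data": [...], "hasNextPage": bool} as the loop condition implies.
# Whether the endpoint takes GET query params (as here) or a POST body is an
# assumption; error handling is deliberately minimal.
def fetch_page(client: httpx.Client, url: str, payload: dict) -> dict:
    response = client.get(url, params=payload)
    response.raise_for_status()
    return response.json()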
from cachecontrol.caches.file_cache import FileCache

PLATFORM = "mesoigner"
PLATFORM_CONF = get_conf_platform("mesoigner")
PLATFORM_ENABLED = PLATFORM_CONF.get("enabled", False)

MESOIGNER_HEADERS = {
    "Authorization": f'Mesoigner apikey="{os.environ.get("MESOIGNER_API_KEY", "")}"',
}
MESOIGNER_APIs = PLATFORM_CONF.get("api", "")

SCRAPER_CONF = PLATFORM_CONF.get("center_scraper", {})
CENTER_LIST_URL = PLATFORM_CONF.get("api", {}).get("center_list", {})

BOOSTER_VACCINES = get_config().get("vaccines_allowed_for_booster", [])

timeout = httpx.Timeout(PLATFORM_CONF.get("timeout", 30), connect=PLATFORM_CONF.get("timeout", 30))

if os.getenv("WITH_TOR", "no") == "yes":
    session = requests.Session()
    session.proxies = {  # type: ignore
        "http": "socks5://127.0.0.1:9050",
        "https": "socks5://127.0.0.1:9050",
    }
    DEFAULT_CLIENT = session  # type: ignore
else:
    DEFAULT_CLIENT = httpx.Client(timeout=timeout)

logger = logging.getLogger("scraper")
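# The WITH_TOR branch leaves DEFAULT_CLIENT as either a requests.Session or an
# httpx.Client; the scraper only relies on the small request surface the two
# share. A sketch of that implied contract (my reading, not project code):
def get_json(client, url: str) -> dict:
    response = client.get(url, headers=MESOIGNER_HEADERS)
    response.raise_for_status()
    return response.json()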
MAPHARMA_ENABLED = MAPHARMA_CONF.get("enabled", False)
# timeout = httpx.Timeout(MAPHARMA_CONF.get("timeout", 25), connect=MAPHARMA_CONF.get("timeout", 25))
MAPHARMA_REFERER = MAPHARMA_CONF.get("headers", {}).get("referer", {})
MAPHARMA_HEADERS = {
    "User-Agent": os.environ.get("MAPHARMA_API_KEY", ""),
    "Referer": MAPHARMA_REFERER,
}
MAPHARMA_FILTERS = MAPHARMA_CONF.get("filters", {})
MAPHARMA_CAMPAGNES_VALIDES = MAPHARMA_CONF.get("valid_campaigns", [])
MAPHARMA_CAMPAGNES_INVALIDES = MAPHARMA_CONF.get("invalid_campaigns", [])
MAPHARMA_PATHS = MAPHARMA_CONF.get("paths", {})
MAPHARMA_OPEN_DATA_FILE = Path(MAPHARMA_PATHS.get("opendata", ""))
MAPHARMA_OPEN_DATA_URL = MAPHARMA_API.get("opendata", "")
MAPHARMA_OPEN_DATA_URL_FALLBACK = MAPHARMA_API.get("opendata_fallback", "")

NUMBER_OF_SCRAPED_DAYS = get_config().get("scrape_on_n_days", 28)
BOOSTER_VACCINES = get_config().get("vaccines_allowed_for_booster", [])

DEFAULT_CLIENT = httpx.Client(headers=MAPHARMA_HEADERS)

logger = logging.getLogger("scraper")
paris_tz = timezone("Europe/Paris")

campagnes_valides = []
campagnes_inconnues = []
opendata = []


def get_possible_dose_numbers(vaccine_list: list):
    if not vaccine_list:
        return []
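# Hedged guess at how get_possible_dose_numbers continues, shown only to make
# the truncation readable (the real logic may differ): first and second doses
# are always offered, and a booster dose only when every vaccine in the list
# appears in BOOSTER_VACCINES.
#     dose_numbers = [1, 2]
#     if all(v in BOOSTER_VACCINES for v in vaccine_list):
#         dose_numbers.append(3)
#     return dose_numbers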
import pytz

from utils.vmd_config import get_config
from utils.vmd_utils import departementUtils
from scraper.pattern.center_location import CenterLocation
from scraper.pattern.scraper_result import ScraperResult
from scraper.pattern.vaccine import Vaccine
from utils.vmd_utils import urlify, format_phone_number
from utils.vmd_logger import get_logger

logger = get_logger()

# Day thresholds used to split appointment counts into per-interval buckets.
INTERVAL_SPLIT_DAYS = get_config().get("appointment_split_days", [])

# "Chronodose" (short-notice appointment) parameters.
CHRONODOSE_CONF = get_config().get("chronodoses", {})
CHRONODOSES = {
    "Vaccine": CHRONODOSE_CONF.get("vaccine", []),
    "Interval": CHRONODOSE_CONF.get("interval", 0),
}


class CenterInfo:
    def __init__(
        self,
        departement: str,
        nom: str,
        url: str,
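# Illustrative helper (an assumption about how CHRONODOSES is consumed, not
# project code): an appointment would count as a "chronodose" when its vaccine
# is in the configured list and it starts within the configured interval,
# expressed in days.
def is_chronodose(vaccine: Vaccine, days_from_now: int) -> bool:
    return vaccine in CHRONODOSES["Vaccine"] and days_from_now <= CHRONODOSES["Interval"]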
import dateutil
from dateutil.tz import gettz
from datetime import datetime, timedelta
from typing import Iterator, Union

from .resource import Resource
from scraper.creneaux.creneau import Creneau, Lieu, Plateforme, PasDeCreneau
from utils.vmd_config import get_config

DEFAULT_NEXT_DAYS = get_config().get("scrape_on_n_days", 7)
DEFAULT_TAGS = {"all": [lambda creneau: True]}


class ResourceCreneauxQuotidiens(Resource):
    def __init__(self, departement, next_days=DEFAULT_NEXT_DAYS, now=datetime.now, tags=DEFAULT_TAGS):
        super().__init__()
        self.departement = departement
        self.now = now
        self.next_days = next_days
        today = now(tz=gettz("Europe/Paris"))
        self.dates = {}
        for days_from_now in range(0, next_days + 1):
            day = today + timedelta(days=days_from_now)
            date = as_date(day)
            self.dates[date] = ResourceCreneauxParDate(date=date, tags=tags)

    def on_creneau(self, creneau: Union[Creneau, PasDeCreneau]):
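# `as_date` and `ResourceCreneauxParDate` are referenced above but fall outside
# this excerpt. A minimal stand-in for `as_date`, consistent only with its use
# as a dict key here (an assumption, not the project's definition):
def as_date(day: datetime) -> str:
    return day.strftime("%Y-%m-%d")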
from scraper.circuit_breaker import ShortCircuit
from scraper.creneaux.creneau import Creneau, Lieu, Plateforme, PasDeCreneau
import json
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

# PLATFORM MUST BE LOWERCASE; PLEASE KEEP THE "lower()" CALL IN CASE OF BAD INPUT FORMAT.
PLATFORM = "keldoc".lower()

PLATFORM_CONF = get_conf_platform("keldoc")
PLATFORM_ENABLED = PLATFORM_CONF.get("enabled", False)
PLATFORM_TIMEOUT = PLATFORM_CONF.get("timeout", 25)
SCRAPE_ONLY_ATLAS = get_config().get("scrape_only_atlas_centers", False)

timeout = httpx.Timeout(PLATFORM_TIMEOUT, connect=PLATFORM_TIMEOUT)

# change KELDOC_KILL_SWITCH to True to bypass Keldoc scraping
KELDOC_HEADERS = {
    "User-Agent": os.environ.get("KELDOC_API_KEY", ""),
}
session = httpx.Client(timeout=timeout, headers=KELDOC_HEADERS)
logger = logging.getLogger("scraper")


# Allow 10 bad runs of keldoc_slot before giving up for the next 200 tries.
# @ShortCircuit("keldoc_slot", trigger=10, release=200, time_limit=40.0)
# @Profiling.measure("keldoc_slot")
def fetch_slots(request: ScraperRequest, creneau_q=DummyQueue()):
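# Hedged usage sketch: fetch_slots is presumably invoked once per center with
# a ScraperRequest carrying the booking URL, pushing found slots to creneau_q.
# The constructor arguments below are assumptions about ScraperRequest's
# signature, not verified against the class, hence left commented out:
# request = ScraperRequest(center_booking_url, start_date)
# fetch_slots(request, creneau_q=DummyQueue())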