def get_json_for_image(barcode: str, image_name: str) -> Optional[JSONType]:
    """Fetch the OCR JSON attached to one image of a product.

    The URL is built by ``generate_json_ocr_url`` from ``barcode`` and
    ``image_name`` and requested through the shared ``http_session``.

    Returns:
        The decoded JSON body, or ``None`` when the server answers 404.
    """
    response = http_session.get(generate_json_ocr_url(barcode, image_name))
    # Only 404 is special-cased; any other status is handed to the JSON
    # decoder unchanged.
    return None if response.status_code == 404 else response.json()
def download_dataset(output_path: os.PathLike) -> str:
    """Download the JSONL dataset to ``output_path`` and return its ETag.

    The dataset is fetched from ``settings.JSONL_DATASET_URL`` as a streamed
    response and its raw body is copied to ``output_path``.

    Args:
        output_path: Destination file; it is opened in binary write mode, so
            an existing file is overwritten.

    Returns:
        The value of the response's ``ETag`` header with surrounding single
        and double quotes stripped, or an empty string when the header is
        absent.
    """
    r = http_session.get(settings.JSONL_DATASET_URL, stream=True)

    # A streamed response keeps its connection checked out until the body is
    # consumed or the response is closed, so release it explicitly even when
    # writing the file fails.
    try:
        current_etag = r.headers.get("ETag", "").strip("'\"")

        logger.info("Dataset has changed, downloading file")
        logger.debug("Saving temporary file in {}".format(output_path))

        with open(output_path, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    finally:
        r.close()

    return current_etag
def fetch_taxonomy(url: str, fallback_path: str, offline: bool = False) \
        -> Optional[Taxonomy]:
    """Load a taxonomy from ``url``, falling back to a local JSON file.

    Args:
        url: Location of the taxonomy JSON, requested with a 5 second timeout.
        fallback_path: Path given to ``Taxonomy.from_json`` when running
            offline or when the download fails. A falsy value disables the
            fallback for failed downloads.
        offline: When true, skip the network and load ``fallback_path``
            directly.

    Returns:
        The taxonomy built from the downloaded data or from the fallback
        file, or ``None`` when the download fails and no fallback path was
        given.
    """
    if offline:
        return Taxonomy.from_json(fallback_path)

    try:
        r = http_session.get(url, timeout=5)
        # Treat HTTP error statuses like any other download failure so that
        # an error payload is never parsed as taxonomy data.
        r.raise_for_status()
        data = r.json()
    except Exception:
        # Deliberately broad: any failure to obtain the remote data triggers
        # the fallback.
        if fallback_path:
            return Taxonomy.from_json(fallback_path)
        return None

    return Taxonomy.from_dict(data)
def save_image(directory: pathlib.Path,
               image_meta: JSONType,
               barcode: str,
               override: bool = False) -> None:
    """Download one product image into ``directory``.

    The file is named ``<barcode>_<imgid>.jpg``, where ``imgid`` is read from
    ``image_meta['imgid']``.

    Args:
        directory: Folder in which the image file is created.
        image_meta: Image metadata; only its ``'imgid'`` key is read.
        barcode: Product barcode, used in both the URL and the file name.
        override: When false, an already existing file is left untouched and
            nothing is downloaded.
    """
    image_name = image_meta['imgid']
    image_full_name = "{}_{}.jpg".format(barcode, image_name)
    image_path = directory / image_full_name

    if image_path.exists() and not override:
        return

    image_url = generate_image_url(barcode, image_name)
    logger.info("Downloading image {}".format(image_url))
    r = http_session.get(image_url)

    # Never persist an error response: it would be stored under the .jpg name
    # and, since the file would then exist, later calls would skip the
    # download.
    if not r.ok:
        logger.warning("Could not download image {} (HTTP {})".format(
            image_url, r.status_code))
        return

    with open(str(image_path), 'wb') as fd:
        logger.info("Saving image in {}".format(image_path))
        # The request is not streamed, so the body is already in memory and
        # can be written in one call.
        fd.write(r.content)
def extract_ocr_insights(ocr_url: str,
                         insight_types: Iterable[str]) -> JSONType:
    """Download an OCR JSON document and extract insights from it.

    Args:
        ocr_url: URL of the OCR JSON; an HTTP error status raises through
            ``raise_for_status``.
        insight_types: Insight types passed one by one to
            ``ocr.extract_insights``.

    Returns:
        A dict mapping each insight type to its extracted insights, limited
        to the types that produced a truthy result. An empty dict is returned
        when ``ocr.OCRResult.from_json`` yields ``None``.
    """
    response = http_session.get(ocr_url)
    response.raise_for_status()
    payload: Dict = response.json()

    parsed = ocr.OCRResult.from_json(payload)
    if parsed is None:
        logger.info("Error during OCR extraction: {}".format(ocr_url))
        return {}

    # Lazily pair each type with its extraction so every type is processed
    # exactly once, in the order given.
    extracted = (
        (kind, ocr.extract_insights(parsed, kind)) for kind in insight_types
    )
    return {kind: found for kind, found in extracted if found}
def fetch_images_for_ean(ean: str):
    """Request the ``images`` field of a product from the Open Food Facts API.

    Args:
        ean: Product code inserted into the API URL.

    Returns:
        The decoded JSON body of the API response.
    """
    url_template = (
        "https://world.openfoodfacts.org/api/v0/product/{}.json?fields=images"
    )
    response = http_session.get(url_template.format(ean))
    return response.json()