from pymongo import MongoClient
from robotoff.utils.cache import CachedStore
from robotoff import settings


def get_mongo_client() -> MongoClient:
    """Build a `MongoClient` for the URI configured in `settings.MONGO_URI`.

    Returns:
        MongoClient: a newly constructed client; every call builds a new one.
    """
    client = MongoClient(settings.MONGO_URI)
    return client


# Shared store wrapping `get_mongo_client`; callers obtain the client through
# `MONGO_CLIENT_CACHE.get()`.
# NOTE(review): `expiration_interval=None` presumably means the cached client
# never expires -- confirm against `CachedStore`.
MONGO_CLIENT_CACHE = CachedStore(get_mongo_client, expiration_interval=None)
for (label_tag, label), span_start, span_end in processor.extract_keywords( text, span_info=True): match_str = text[span_start:span_end] insights.append({ "label_tag": label_tag, "text": match_str, "data_source": "flashtext", "notify": False, }) return insights LOGO_ANNOTATION_LABELS: Dict[str, str] = get_logo_annotation_labels() LABEL_KEYWORD_PROCESSOR_STORE = CachedStore( fetch_func=generate_label_keyword_processor, expiration_interval=None) def find_labels(content: Union[OCRResult, str]) -> List[Dict]: insights = [] for label_tag, regex_list in LABELS_REGEX.items(): for ocr_regex in regex_list: text = get_text(content, ocr_regex) if not text: continue for match in ocr_regex.regex.finditer(text): if ocr_regex.processing_func: label_value = ocr_regex.processing_func(match)
def in_barcode_range(brand_prefix: Set[Tuple[str, str]],
                     brand_tag: str,
                     barcode: str) -> bool:
    """Tell whether a barcode is compatible with the prefixes known for a brand.

    Only barcodes of length 13 are actually checked: the prefix produced by
    `generate_barcode_prefix` is paired with `brand_tag` and looked up in
    `brand_prefix`. A barcode of any other length always passes the check.

    Args:
        brand_prefix: set of (brand tag, barcode prefix) pairs.
        brand_tag: tag of the detected brand.
        barcode: barcode of the insight.

    Returns:
        bool: False when a 13-character barcode yields a
        (brand_tag, prefix) pair that is absent from `brand_prefix`,
        True otherwise.
    """
    # Barcodes that are not 13 characters long are not range-checked.
    if len(barcode) != 13:
        return True

    prefix = generate_barcode_prefix(barcode)
    return (brand_tag, prefix) in brand_prefix


# Module-level stores wrapping the brand prefix / brand blacklist loaders.
# NOTE(review): `expiration_interval=None` presumably disables expiration --
# confirm against `CachedStore`.
BRAND_PREFIX_STORE = CachedStore(
    fetch_func=get_brand_prefix,
    expiration_interval=None,
)
BRAND_BLACKLIST_STORE = CachedStore(
    fetch_func=get_brand_blacklist,
    expiration_interval=None,
)


if __name__ == "__main__":
    # Script entry point: load the blacklist directly (bypassing the store),
    # then dump the taxonomy brands using the thresholds from the settings.
    blacklisted_brands = get_brand_blacklist()
    dump_taxonomy_brands(
        threshold=settings.BRAND_MATCHING_MIN_COUNT,
        min_length=settings.BRAND_MATCHING_MIN_LENGTH,
        blacklisted_brands=blacklisted_brands,
    )
for predictions, product in zip(predictions_batch, product_batch): for insight in format_predictions(product, predictions, "xx"): yield insight def format_predictions(product: Dict, predictions: List[CategoryPrediction], lang: str) -> List[Dict]: formatted_predictions = [] for category, confidence in predictions: formatted = { "barcode": product["code"], "category": category, "lang": lang, "model": "neural", "confidence": confidence, } formatted_predictions.append(formatted) return formatted_predictions def filter_blacklisted_categories( predictions: List[CategoryPrediction], ) -> List[CategoryPrediction]: category_blacklist: Set[str] = CATEGORY_BLACKLIST_STORE.get() return [(category, confidence) for (category, confidence) in predictions if category not in category_blacklist] CATEGORY_BLACKLIST_STORE = CachedStore(load_category_blacklist)
for (key, _), span_start, span_end in processor.extract_keywords( text, span_info=True): match_str = text[span_start:span_end] insights.append( RawInsight( type=InsightType.packager_code, value=key, predictor="flashtext", data={ "type": "fishing", "raw": match_str, "notify": False }, automatic_processing=True, )) return insights FISHING_KEYWORD_PROCESSOR_STORE = CachedStore( fetch_func=generate_fishing_code_keyword_processor, expiration_interval=None) def find_packager_codes(ocr_result: Union[OCRResult, str]) -> List[RawInsight]: insights = find_packager_codes_regex(ocr_result) processor = FISHING_KEYWORD_PROCESSOR_STORE.get() text = get_text(ocr_result) insights += extract_fishing_code(processor, text) return insights
def __getitem__(self, barcode: str) -> Optional[Product]: product = self.get_product(barcode) if product: return Product(product) return None def __iter__(self): raise NotImplementedError("cannot iterate over database product store") def load_min_dataset() -> ProductStore: ps = MemoryProductStore.load_min() logger.info("product store loaded ({} items)".format(len(ps))) return ps def get_product_store() -> DBProductStore: mongo_client = MONGO_CLIENT_CACHE.get() return DBProductStore(client=mongo_client) def get_product(barcode: str, projection: Optional[List[str]] = None) -> Optional[JSONType]: mongo_client = MONGO_CLIENT_CACHE.get() return mongo_client.off.products.find_one({"code": barcode}, projection) CACHED_PRODUCT_STORE = CachedStore(load_min_dataset)
data = r.json() except Exception as e: logger.exception(f"{type(e)} exception while fetching taxonomy at %s", url) if fallback_path: return Taxonomy.from_json(fallback_path) else: return None return Taxonomy.from_dict(data) TAXONOMY_STORES: Dict[str, CachedStore] = { TaxonomyType.category.name: CachedStore( functools.partial( fetch_taxonomy, url=settings.TAXONOMY_CATEGORY_URL, fallback_path=settings.TAXONOMY_CATEGORY_PATH, ) ), TaxonomyType.ingredient.name: CachedStore( functools.partial( fetch_taxonomy, url=settings.TAXONOMY_INGREDIENT_URL, fallback_path=settings.TAXONOMY_INGREDIENT_PATH, ) ), TaxonomyType.label.name: CachedStore( functools.partial( fetch_taxonomy, url=settings.TAXONOMY_LABEL_URL, fallback_path=settings.TAXONOMY_LABEL_PATH,
try: r = requests.get(url, timeout=5) data = r.json() except Exception: if fallback_path: return Taxonomy.from_json(fallback_path) else: return None return Taxonomy.from_dict(data) TAXONOMY_STORES: Dict[str, CachedStore] = { TaxonomyType.category.name: CachedStore( functools.partial(fetch_taxonomy, url=settings.TAXONOMY_CATEGORY_URL, fallback_path=settings.TAXONOMY_CATEGORY_PATH)), TaxonomyType.ingredient.name: CachedStore( functools.partial(fetch_taxonomy, url=settings.TAXONOMY_INGREDIENT_URL, fallback_path=settings.TAXONOMY_INGREDIENT_PATH)), TaxonomyType.label.name: CachedStore( functools.partial(fetch_taxonomy, url=settings.TAXONOMY_LABEL_URL, fallback_path=settings.TAXONOMY_LABEL_PATH)) }
from robotoff.insights.ocr.utils import generate_keyword_processor from robotoff.insights.ocr.utils import get_tag from robotoff.utils import text_file_iter from robotoff.utils.cache import CachedStore def generate_packaging_keyword_processor( packaging: Optional[List[str]] = None): if packaging is None: packaging = text_file_iter(settings.OCR_PACKAGING_DATA_PATH) return generate_keyword_processor(packaging) KEYWORD_PROCESSOR_STORE = CachedStore( fetch_func=generate_packaging_keyword_processor, expiration_interval=None) def find_packaging(content: Union[OCRResult, str]) -> List[Dict]: insights = [] text = get_text(content) if not text: return [] processor = KEYWORD_PROCESSOR_STORE.get() for (packaging_str, _), span_start, span_end in processor.extract_keywords( text, span_info=True): packagings = packaging_str.split(";")
from robotoff.models import batch_insert, ProductInsight from robotoff.off import get_server_type from robotoff.products import get_product_store, is_valid_image, Product, ProductStore from robotoff.taxonomy import get_taxonomy, Taxonomy, TaxonomyNode from robotoff.utils import get_logger, text_file_iter from robotoff.utils.cache import CachedStore from robotoff.utils.types import JSONType logger = get_logger(__name__) def load_authorized_labels() -> Set[str]: return set(text_file_iter(settings.OCR_LABEL_WHITELIST_DATA_PATH)) AUTHORIZED_LABELS_STORE = CachedStore(load_authorized_labels, expiration_interval=None) def generate_seen_set_query(insight_type: InsightType, barcode: str, server_domain: str): return ProductInsight.select( ProductInsight.value, ProductInsight.value_tag).where( ProductInsight.type == insight_type.name, ProductInsight.latent == False, # noqa: E712 ProductInsight.barcode == barcode, ProductInsight.server_domain == server_domain, ) def is_reserved_barcode(barcode: str) -> bool: if barcode.startswith("0"):
return None pattern = r"(?:[^0-9]|^)({})(?:[^0-9]|$)".format(city.postal_code) sub_start = max(0, city_start - self.postal_code_search_distance) sub_end = min(len(text), city_end + self.postal_code_search_distance) sub_text = text[sub_start:sub_end] match = re.search(pattern, sub_text) if match is None: return None else: return match.group( 1), sub_start + match.start(1), sub_start + match.end(1) ADDRESS_EXTRACTOR_STORE = CachedStore( lambda: AddressExtractor(load_cities_fr()), expiration_interval=None) def find_locations(content: Union[OCRResult, str]) -> List[RawInsight]: """Find location insights in the text content. See :class:`.AddressExtractor`. Args: content (OCRResult or str): The content to be searched for locations. Returns: list of RawInsight: See :meth:`.AddressExtractor.extract_addresses`. """ location_extractor: AddressExtractor = ADDRESS_EXTRACTOR_STORE.get() return location_extractor.extract_addresses(content)
logger.info("Loading product store") ds = ProductDataset(path) stream = ds.stream() seen = set() for product in stream.iter_product(): if product.barcode: seen.add(product.barcode) self.store[product.barcode] = product if reset: for key in set(self.store.keys()).difference(seen): self.store.pop(key) logger.info("product store loaded ({} items added)".format(len(seen))) @classmethod def load_from_min_dataset(cls): product_store = ProductStore() product_store.load(settings.JSONL_MIN_DATASET_PATH, False) return product_store def __getitem__(self, item) -> Optional[Product]: return self.store.get(item) def __iter__(self) -> Iterable[Product]: return iter(self.store.values()) CACHED_PRODUCT_STORE = CachedStore(lambda: ProductStore.load_from_min_dataset())
def get_deaccent_cache(voc_cache: CachedStore) -> CachedStore:
    """Wrap the deaccented-token builder of a cached vocabulary in a store.

    `voc_cache.get()` is called as soon as this function runs; its result is
    handed to `Vocabulary.deaccent_tokens_fn`, and whatever that returns is
    passed as the first argument of a new `CachedStore`.

    Args:
        voc_cache: store whose `get()` provides the vocabulary.

    Returns:
        CachedStore: a new store built with default `CachedStore` settings.
    """
    vocabulary = voc_cache.get()
    # presumably a zero-argument callable producing the deaccented tokens --
    # verify against `Vocabulary.deaccent_tokens_fn`
    deaccent_fn = Vocabulary.deaccent_tokens_fn(vocabulary)
    return CachedStore(deaccent_fn)
def get_voc_cache(path: Path) -> CachedStore:
    """Create a store backed by the vocabulary loader for `path`.

    The value returned by `Vocabulary.load_vocabulary_fn(path)` is passed as
    the first argument of a new `CachedStore`.

    Args:
        path: location handed to `Vocabulary.load_vocabulary_fn`.

    Returns:
        CachedStore: a new store built with default `CachedStore` settings.
    """
    # presumably a zero-argument callable that loads the vocabulary --
    # verify against `Vocabulary.load_vocabulary_fn`
    loader = Vocabulary.load_vocabulary_fn(path)
    return CachedStore(loader)
if added >= k: break return predictions class FastTextLanguageIdentifier(LanguageIdentifier): def __init__(self, model): self.model = model def predict(self, text: str, k: int = 10, threshold: float = 0.0) -> List[LanguagePrediction]: predictions: List[LanguagePrediction] = [] languages, confidences = self.model.predict(text, k=k, threshold=threshold) for language, confidence in zip(languages, confidences): # language str format is __label__en language = language[9:] prediction = LanguagePrediction(language, confidence) predictions.append(prediction) return predictions DEFAULT_LANGUAGE_IDENTIFIER = CachedStore(LangidLanguageIdentifier.load)
} LogoLabelType = Tuple[str, Optional[str]] UNKNOWN_LABEL: LogoLabelType = ("UNKNOWN", None) def get_logo_confidence_thresholds() -> Dict[LogoLabelType, float]: thresholds = {} for item in LogoConfidenceThreshold.select().iterator(): thresholds[(item.type, item.value)] = item.threshold return thresholds LOGO_CONFIDENCE_THRESHOLDS = CachedStore(get_logo_confidence_thresholds, expiration_interval=10) def get_stored_logo_ids() -> Set[int]: r = http_session.get( "https://robotoff.openfoodfacts.org/api/v1/ann/stored", timeout=30) if not r.ok: logger.warning( f"error while fetching stored logo IDs ({r.status_code}): {r.text}" ) return set() return set(r.json()["stored"])