def find_labels(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Collect label insights from ``content``.

    Three sources are combined, in this order:

    1. every regex listed in ``LABELS_REGEX``;
    2. the keyword processor held by ``LABEL_KEYWORD_PROCESSOR_STORE``;
    3. logo annotations, only when ``content`` is an ``OCRResult``.

    Returns:
        The list of label insights, possibly empty.
    """
    found = []

    # 1. Regex-based detection.
    for default_tag, patterns in LABELS_REGEX.items():
        for pattern in patterns:
            source_text = get_text(content, pattern)
            if not source_text:
                continue

            for hit in pattern.regex.finditer(source_text):
                if not pattern.processing_func:
                    # No post-processing: the table key is the label tag.
                    tag = default_tag
                else:
                    tag = pattern.processing_func(hit)
                    # The post-processor rejects a match by returning None.
                    if tag is None:
                        continue

                found.append(
                    RawInsight(
                        type=InsightType.label,
                        value_tag=tag,
                        predictor="regex",
                        data={"text": hit.group(), "notify": pattern.notify},
                    )
                )

    # 2. Keyword-based detection on the text returned by get_text(content).
    keyword_processor = LABEL_KEYWORD_PROCESSOR_STORE.get()
    full_text = get_text(content)
    found.extend(extract_label_flashtext(keyword_processor, full_text))

    # 3. Logo annotations are only available on OCRResult objects.
    if isinstance(content, OCRResult):
        for logo in content.logo_annotations:
            if logo.description not in LOGO_ANNOTATION_LABELS:
                continue

            found.append(
                RawInsight(
                    type=InsightType.label,
                    value_tag=LOGO_ANNOTATION_LABELS[logo.description],
                    automatic_processing=False,
                    predictor="google-cloud-vision",
                    data={"confidence": logo.score},
                )
            )

    return found
def find_nutrient_mentions(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Locate nutrient mentions using ``NUTRIENT_MENTIONS_REGEX``.

    Each match is recorded under its regex code with the raw matched text,
    its ``[start, end]`` span and a list of languages. The languages are
    derived from the name of the first named group that matched: the part
    after the last underscore is dropped and the remainder is split on
    underscores.

    Returns:
        A single-element list holding one ``nutrient_mention`` insight, or an
        empty list when nothing matched.
    """
    mentions: JSONType = {}

    for code, pattern in NUTRIENT_MENTIONS_REGEX.items():
        source_text = get_text(content, pattern)
        if not source_text:
            continue

        for hit in pattern.regex.finditer(source_text):
            # Names of the named groups that took part in this match.
            matched_groups = [
                name
                for name, captured in hit.groupdict().items()
                if captured is not None
            ]

            langs: List[str] = []
            if matched_groups:
                prefix = matched_groups[0].rsplit("_", maxsplit=1)[0]
                langs = prefix.split("_")

            mentions.setdefault(code, []).append(
                {
                    "raw": hit.group(0),
                    "span": list(hit.span()),
                    "languages": langs,
                }
            )

    if not mentions:
        return []

    insight = RawInsight(
        type=InsightType.nutrient_mention,
        data={"mentions": mentions, "version": EXTRACTOR_VERSION},
    )
    return [insight]
def find_nutrient_values(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract nutrient values using ``NUTRIENT_VALUES_REGEX``.

    For each match, group 2 is taken as the value (with ``,`` replaced by
    ``.``) and group 3 as the unit. Entries are grouped by regex code.

    Returns:
        A single-element list holding one ``nutrient`` insight, or an empty
        list when nothing matched.
    """
    extracted: JSONType = {}

    for code, pattern in NUTRIENT_VALUES_REGEX.items():
        source_text = get_text(content, pattern)
        if not source_text:
            continue

        for hit in pattern.regex.finditer(source_text):
            entry = {
                "raw": hit.group(0),
                "nutrient": code,
                # Normalise the decimal separator.
                "value": hit.group(2).replace(",", "."),
                "unit": hit.group(3),
            }
            extracted.setdefault(code, []).append(entry)

    if not extracted:
        return []

    insight = RawInsight(
        type=InsightType.nutrient,
        data={"nutrients": extracted, "version": EXTRACTOR_VERSION},
    )
    return [insight]
def find_packaging(content: Union[OCRResult, str]) -> List[Dict]:
    """Detect packaging keywords in ``content``.

    The keyword processor from ``KEYWORD_PROCESSOR_STORE`` yields, for each
    hit, a ``;``-separated string of packaging names; one dict is emitted per
    name, all sharing the same matched text.

    Returns:
        A list of dicts with the keys ``packaging_tag``, ``packaging``,
        ``text``, ``notify`` and ``automatic_processing``; empty when there is
        no text or no keyword hit.
    """
    source_text = get_text(content)
    if not source_text:
        return []

    keyword_processor = KEYWORD_PROCESSOR_STORE.get()
    keyword_hits = keyword_processor.extract_keywords(source_text, span_info=True)

    found = []
    for (joined_names, _), start, end in keyword_hits:
        matched = source_text[start:end]
        found.extend(
            {
                "packaging_tag": get_tag(name),
                "packaging": name,
                "text": matched,
                "notify": True,
                "automatic_processing": True,
            }
            for name in joined_names.split(";")
        )

    return found
def find_packager_codes_regex(
    ocr_result: Union[OCRResult, str],
) -> List[RawInsight]:
    """Find packager codes with the regexes in ``PACKAGER_CODE``.

    The insight value is the whole match when the regex has no
    ``processing_func``; otherwise it is whatever ``processing_func`` returns
    for the match.

    Returns:
        One ``packager_code`` insight per regex match, possibly none.
    """
    found: List[RawInsight] = []

    for code_type, pattern in PACKAGER_CODE.items():
        source_text = get_text(ocr_result, pattern, pattern.lowercase)
        if not source_text:
            continue

        for hit in pattern.regex.finditer(source_text):
            whole_match = hit.group(0)
            code = (
                whole_match
                if pattern.processing_func is None
                else pattern.processing_func(hit)
            )
            found.append(
                RawInsight(
                    value=code,
                    data={
                        "raw": whole_match,
                        "type": code_type,
                        "notify": pattern.notify,
                    },
                    type=InsightType.packager_code,
                    automatic_processing=True,
                )
            )

    return found
def find_nutrient_values(content: Union[OCRResult, str]) -> List[Dict]:
    """Extract nutrient values using ``NUTRIENT_VALUES_REGEX``.

    For each match, group 2 is taken as the value (with ``,`` replaced by
    ``.``) and group 3 as the unit. Entries are grouped by regex code.

    Returns:
        A single-element list containing a dict with the keys ``nutrients``
        and ``notify``, or an empty list when nothing matched.
    """
    extracted: JSONType = {}

    for code, pattern in NUTRIENT_VALUES_REGEX.items():
        source_text = get_text(content, pattern)
        if not source_text:
            continue

        for hit in pattern.regex.finditer(source_text):
            entry = {
                "raw": hit.group(0),
                "nutrient": code,
                # Normalise the decimal separator.
                "value": hit.group(2).replace(",", "."),
                "unit": hit.group(3),
            }
            extracted.setdefault(code, []).append(entry)

    if not extracted:
        return []

    summary = {
        "nutrients": extracted,
        "notify": False,
    }
    return [summary]
def predict_ocr_categories(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Predict categories from OCR text and return them as insights.

    The best-scoring category is always returned. The runner-up is returned
    as well when the model "hesitates", i.e. when the gap between the two
    highest probabilities is at most ``HESITATION_THRESHOLD``.

    Returns:
        An empty list when there is no text, otherwise one or two insights
        ordered from most to least probable.
    """
    text = get_text(content)
    if not text:
        return []

    scores = Predictor(text=text).run()

    # argsort is ascending, so the two best candidates sit at the end.
    ranking = np.argsort(scores)
    top_idx = ranking[-1]
    runner_up_idx = ranking[-2]
    top_score = scores[top_idx]
    runner_up_score = scores[runner_up_idx]

    insights = [_get_raw_insight(top_score, top_idx)]

    hesitating = (top_score - runner_up_score) <= HESITATION_THRESHOLD
    if hesitating:
        insights.append(_get_raw_insight(runner_up_score, runner_up_idx))

    return insights
def find_traces(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Detect trace insights introduced by a ``TRACES_REGEX`` match.

    For every regex match, the 100 characters that follow it are scanned with
    the keyword processor from ``TRACE_KEYWORD_PROCESSOR_STORE``; each keyword
    hit becomes one insight that records the keyword text and the regex match
    that introduced it (the "prompt").

    Returns:
        The list of ``trace`` insights; empty when there is no text or no hit.
    """
    source_text = get_text(content, TRACES_REGEX)
    if not source_text:
        return []

    keyword_processor = TRACE_KEYWORD_PROCESSOR_STORE.get()
    found = []

    for hit in TRACES_REGEX.regex.finditer(source_text):
        lead_in = hit.group()
        window_start = hit.end()
        # Only look at a fixed-size window right after the prompt.
        window = source_text[window_start : window_start + 100]

        keyword_hits = keyword_processor.extract_keywords(window, span_info=True)
        for (tag, _), start, end in keyword_hits:
            found.append(
                RawInsight(
                    type=InsightType.trace,
                    value_tag=tag,
                    data={
                        "text": window[start:end],
                        "prompt": lead_in,
                        "notify": False,
                    },
                )
            )

    return found
def find_stores(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Detect store names using ``STORE_REGEX``.

    For each match, the position of the first capture group that is not
    ``None`` is used as an index into ``SORTED_STORES`` to recover the store
    name. A match with no captured group produces no insight.

    Returns:
        One ``store`` insight per usable match; empty when there is no text.
    """
    source_text = get_text(content, STORE_REGEX)
    if not source_text:
        return []

    found = []
    for hit in STORE_REGEX.regex.finditer(source_text):
        # First (index, text) pair whose group actually captured something.
        first_captured = next(
            (
                (position, captured)
                for position, captured in enumerate(hit.groups())
                if captured is not None
            ),
            None,
        )
        if first_captured is None:
            continue

        position, captured = first_captured
        store_name, _ = SORTED_STORES[position]
        found.append(
            RawInsight(
                type=InsightType.store,
                value=store_name,
                value_tag=get_store_tag(store_name),
                data={"text": captured, "notify": store_name in NOTIFY_STORES},
            )
        )

    return found
def find_product_weight(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract product weight insights using ``PRODUCT_WEIGHT_REGEX``.

    Matches are handed to the regex's ``processing_func``; regexes without
    one, and matches for which it returns ``None``, are ignored. The returned
    mapping is enriched with ``matcher_type``, ``priority`` and ``notify``.
    Its ``text`` entry becomes the insight value and its optional
    ``automatic_processing`` entry becomes the insight flag; what remains is
    stored as the insight data.

    Returns:
        The list of ``product_weight`` insights, possibly empty.
    """
    found = []

    for matcher, pattern in PRODUCT_WEIGHT_REGEX.items():
        source_text = get_text(content, pattern)
        if not source_text:
            continue

        for hit in pattern.regex.finditer(source_text):
            if pattern.processing_func is None:
                continue

            payload = pattern.processing_func(hit)
            if payload is None:
                continue

            payload.update(
                matcher_type=matcher,
                priority=pattern.priority,
                notify=pattern.notify,
            )

            # Pull out the fields that are promoted to insight attributes.
            weight_text = payload.pop("text")
            auto_flag = payload.pop("automatic_processing", None)

            found.append(
                RawInsight(
                    value=weight_text,
                    type=InsightType.product_weight,
                    automatic_processing=auto_flag,
                    data=payload,
                )
            )

    return found
def find_expiration_date(content: Union[OCRResult, str]) -> List[Dict]:
    """Extract expiration dates using ``EXPIRATION_DATE_REGEX``.

    These are dates introduced by phrases such as the French
    "À consommer de préférence avant" ("best before").

    Each match is turned into a date by the regex's ``processing_func``;
    regexes without one, and matches for which it returns ``None``, are
    skipped. Dates whose year falls outside a plausibility window are
    discarded.

    The upper bound of that window used to be the literal year 2025, which
    silently drops every date from 2026 onward. It now follows the current
    year (five years ahead) and never goes below 2025, so every date that was
    accepted before is still accepted.

    Returns:
        A list of dicts with the keys ``raw`` (matched text), ``text`` (date
        formatted as ``YYYY-MM-DD``), ``type`` (regex key) and ``notify``.
    """
    # Function-scope import so the block does not rely on a module-level one.
    import datetime

    min_year = 2015
    max_year = max(2025, datetime.date.today().year + 5)

    results: List[Dict] = []

    for type_, ocr_regex in EXPIRATION_DATE_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            if not ocr_regex.processing_func:
                continue

            parsed = ocr_regex.processing_func(match)
            if parsed is None:
                continue

            # Discard dates whose year is outside the plausibility window.
            if not min_year <= parsed.year <= max_year:
                continue

            results.append(
                {
                    "raw": match.group(0),
                    # Format dates according to ISO 8601.
                    "text": parsed.strftime("%Y-%m-%d"),
                    "type": type_,
                    "notify": ocr_regex.notify,
                }
            )

    return results
def find_packaging(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Detect packaging keywords in ``content``.

    The keyword processor from ``KEYWORD_PROCESSOR_STORE`` yields, for each
    hit, a ``;``-separated string of packaging names; one insight is emitted
    per name, all sharing the same matched text.

    Returns:
        The list of ``packaging`` insights; empty when there is no text or no
        keyword hit.
    """
    source_text = get_text(content)
    if not source_text:
        return []

    keyword_processor = KEYWORD_PROCESSOR_STORE.get()
    keyword_hits = keyword_processor.extract_keywords(source_text, span_info=True)

    found = []
    for (joined_names, _), start, end in keyword_hits:
        matched = source_text[start:end]
        found.extend(
            RawInsight(
                type=InsightType.packaging,
                value_tag=get_tag(name),
                value=name,
                data={"text": matched, "notify": False},
                automatic_processing=True,
            )
            for name in joined_names.split(";")
        )

    return found
def find_brands(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Collect brand insights from ``content``.

    When text is available, it is run through ``BRAND_PROCESSOR`` (predictor
    name ``"curated-list"``) and then ``TAXONOMY_BRAND_PROCESSOR`` (predictor
    name ``"taxonomy"``). When ``content`` is an ``OCRResult``, the results of
    ``extract_brands_google_cloud_vision`` are appended as well.

    Returns:
        The combined list of insights, possibly empty.
    """
    found: List[RawInsight] = []

    source_text = get_text(content)
    if source_text:
        text_sources = (
            (BRAND_PROCESSOR, "curated-list"),
            (TAXONOMY_BRAND_PROCESSOR, "taxonomy"),
        )
        for processor, predictor_name in text_sources:
            found.extend(extract_brands(processor, source_text, predictor_name))

    if isinstance(content, OCRResult):
        found.extend(extract_brands_google_cloud_vision(content))

    return found
def flag_image(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Produce ``image_flag`` insights for ``content``.

    The text is always checked with ``extract_image_flag_flashtext``. When
    ``content`` is not a plain string, two more checks run on it:

    * the safe-search annotation, flagging ``adult`` and ``violence`` when
      their likelihood is at least ``VERY_LIKELY``;
    * the label annotations, flagging the first one whose description is in
      ``LABELS_TO_FLAG`` with a score of at least 0.6.

    Returns:
        The list of flag insights, possibly empty.
    """
    flags: List[RawInsight] = []

    text_flag = extract_image_flag_flashtext(PROCESSOR, get_text(content))
    if text_flag is not None:
        flags.append(text_flag)

    # Plain strings carry no annotations: nothing more to inspect.
    if isinstance(content, str):
        return flags

    safe_search = content.get_safe_search_annotation()
    labels = content.get_label_annotations()

    if safe_search:
        for category in ("adult", "violence"):
            likelihood: SafeSearchAnnotationLikelihood = getattr(
                safe_search, category
            )
            if likelihood >= SafeSearchAnnotationLikelihood.VERY_LIKELY:
                flags.append(
                    RawInsight(
                        type=InsightType.image_flag,
                        data={
                            "type": "safe_search_annotation",
                            "label": category,
                            "likelihood": likelihood.name,
                        },
                    )
                )

    for label in labels:
        should_flag = label.description in LABELS_TO_FLAG and label.score >= 0.6
        if not should_flag:
            continue

        flags.append(
            RawInsight(
                type=InsightType.image_flag,
                data={
                    "type": "label_annotation",
                    "label": label.description.lower(),
                    "likelihood": label.score,
                },
            )
        )
        # At most one label-based flag is reported.
        break

    return flags
def find_packager_codes(ocr_result: Union[OCRResult, str]) -> List[Dict]:
    """Find packager codes with the regexes in ``PACKAGER_CODE``.

    Only regexes that define a ``processing_func`` contribute results; its
    return value for a match is stored under the ``text`` key.

    Returns:
        A list of dicts with the keys ``raw``, ``text``, ``type`` and
        ``notify``, possibly empty.
    """
    found: List[Dict] = []

    for code_type, pattern in PACKAGER_CODE.items():
        source_text = get_text(ocr_result, pattern)
        if not source_text:
            continue

        for hit in pattern.regex.finditer(source_text):
            if pattern.processing_func is None:
                continue

            code = pattern.processing_func(hit)
            found.append(
                {
                    "raw": hit.group(0),
                    "text": code,
                    "type": code_type,
                    "notify": pattern.notify,
                }
            )

    return found
def find_stores(content: Union[OCRResult, str]) -> List[Dict]:
    """Detect store names using ``STORE_REGEX``.

    For each match, the position of the first capture group that is not
    ``None`` is used as an index into ``SORTED_STORES`` to recover the store
    name. A match with no captured group produces no result.

    Returns:
        A list of dicts with the keys ``value``, ``value_tag``, ``text`` and
        ``notify``; empty when there is no text.
    """
    source_text = get_text(content, STORE_REGEX)
    if not source_text:
        return []

    found = []
    for hit in STORE_REGEX.regex.finditer(source_text):
        # First (index, text) pair whose group actually captured something.
        first_captured = next(
            (
                (position, captured)
                for position, captured in enumerate(hit.groups())
                if captured is not None
            ),
            None,
        )
        if first_captured is None:
            continue

        position, captured = first_captured
        store_name, _ = SORTED_STORES[position]
        found.append(
            {
                "value": store_name,
                "value_tag": get_store_tag(store_name),
                "text": captured,
                "notify": store_name in NOTIFY_STORES,
            }
        )

    return found
def find_product_weight(content: Union[OCRResult, str]) -> List[Dict]:
    """Extract product weights using ``PRODUCT_WEIGHT_REGEX``.

    Matches are handed to the regex's ``processing_func``; regexes without
    one, and matches for which it returns ``None``, are ignored. Each returned
    mapping is enriched with ``matcher_type``, ``priority`` and ``notify``
    before being collected.

    Returns:
        The list of enriched mappings, possibly empty.
    """
    found = []

    for matcher, pattern in PRODUCT_WEIGHT_REGEX.items():
        source_text = get_text(content, pattern)
        if not source_text:
            continue

        for hit in pattern.regex.finditer(source_text):
            if pattern.processing_func is None:
                continue

            payload = pattern.processing_func(hit)
            if payload is None:
                continue

            payload.update(
                matcher_type=matcher,
                priority=pattern.priority,
                notify=pattern.notify,
            )
            found.append(payload)

    return found
def find_packager_codes(ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Find packager codes via regexes and via the fishing keyword processor.

    The insights from ``find_packager_codes_regex`` come first, followed by
    those that ``extract_fishing_code`` produces with the processor held by
    ``FISHING_KEYWORD_PROCESSOR_STORE``.

    Returns:
        The combined list of insights, possibly empty.
    """
    combined = find_packager_codes_regex(ocr_result)

    keyword_processor = FISHING_KEYWORD_PROCESSOR_STORE.get()
    full_text = get_text(ocr_result)
    # Extend in place: the list returned by the regex pass is the one returned.
    combined.extend(extract_fishing_code(keyword_processor, full_text))

    return combined