def test_product_insights_merge():
    """Check that ``ProductInsights.merge`` combines two compatible objects.

    Two ``ProductInsights`` sharing the same barcode, type and source image
    are merged; the result must keep that shared metadata and expose the
    insights of both inputs, in input order.
    """
    # Metadata common to both inputs.
    shared = {
        "barcode": "123",
        "type": InsightType.label,
        "source_image": "/123/1.jpg",
    }

    organic_insights = [
        RawInsight(type=InsightType.label, data={}, value_tag="en:organic")
    ]
    pgi_insights = [
        RawInsight(type=InsightType.label, data={}, value_tag="en:pgi")
    ]

    merged = ProductInsights.merge(
        [
            ProductInsights(insights=organic_insights, **shared),
            ProductInsights(insights=pgi_insights, **shared),
        ]
    )

    assert merged.type == InsightType.label
    assert merged.barcode == "123"
    assert merged.source_image == "/123/1.jpg"
    assert merged.insights == organic_insights + pgi_insights
def find_labels(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Collect label insights from regexes, keyword matching and logo annotations.

    Three sources are combined, in this order:

    1. every regex registered in ``LABELS_REGEX`` (predictor ``"regex"``);
    2. ``extract_label_flashtext`` run on the text returned by
       ``get_text(content)``;
    3. when ``content`` is an ``OCRResult``, its logo annotations whose
       description is a key of ``LOGO_ANNOTATION_LABELS`` (predictor
       ``"google-cloud-vision"``).

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        The list of label insights found, possibly empty.
    """
    found: List[RawInsight] = []

    # 1. Regex-based detection.
    for default_tag, regexes in LABELS_REGEX.items():
        for ocr_regex in regexes:
            regex_text = get_text(content, ocr_regex)
            if not regex_text:
                continue

            for match in ocr_regex.regex.finditer(regex_text):
                tag = default_tag
                if ocr_regex.processing_func:
                    # The processing function computes the tag from the match
                    # and may reject the match by returning None.
                    tag = ocr_regex.processing_func(match)
                    if tag is None:
                        continue

                found.append(
                    RawInsight(
                        type=InsightType.label,
                        value_tag=tag,
                        predictor="regex",
                        data={"text": match.group(), "notify": ocr_regex.notify},
                    )
                )

    # 2. Keyword-based detection on the default text of the content.
    found.extend(
        extract_label_flashtext(
            LABEL_KEYWORD_PROCESSOR_STORE.get(), get_text(content)
        )
    )

    # 3. Logo annotations are only available on OCR results.
    if not isinstance(content, OCRResult):
        return found

    for logo_annotation in content.logo_annotations:
        if logo_annotation.description not in LOGO_ANNOTATION_LABELS:
            continue

        found.append(
            RawInsight(
                type=InsightType.label,
                value_tag=LOGO_ANNOTATION_LABELS[logo_annotation.description],
                automatic_processing=False,
                predictor="google-cloud-vision",
                data={"confidence": logo_annotation.score},
            )
        )

    return found
def flag_image(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Build ``image_flag`` insights from text, safe-search and label annotations.

    The text of ``content`` is first checked with
    ``extract_image_flag_flashtext``. For an ``OCRResult``, two further checks
    are performed: the "adult" and "violence" safe-search likelihoods (flagged
    when at least ``VERY_LIKELY``), and the label annotations (the first one
    whose description is in ``LABELS_TO_FLAG`` with a score of at least 0.6).

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        The list of image flag insights, possibly empty.
    """
    flags: List[RawInsight] = []

    text_flag = extract_image_flag_flashtext(PROCESSOR, get_text(content))
    if text_flag is not None:
        flags.append(text_flag)

    # Plain strings carry no annotations: nothing more to inspect.
    if isinstance(content, str):
        return flags

    safe_search_annotation = content.get_safe_search_annotation()
    label_annotations = content.get_label_annotations()

    if safe_search_annotation:
        for category in ("adult", "violence"):
            likelihood: SafeSearchAnnotationLikelihood = getattr(
                safe_search_annotation, category
            )
            if likelihood >= SafeSearchAnnotationLikelihood.VERY_LIKELY:
                flags.append(
                    RawInsight(
                        type=InsightType.image_flag,
                        data={
                            "type": "safe_search_annotation",
                            "label": category,
                            "likelihood": likelihood.name,
                        },
                    )
                )

    # Only the first label annotation that qualifies is reported.
    flagged_label = next(
        (
            annotation
            for annotation in label_annotations
            if annotation.description in LABELS_TO_FLAG
            and annotation.score >= 0.6
        ),
        None,
    )
    if flagged_label is not None:
        flags.append(
            RawInsight(
                type=InsightType.image_flag,
                data={
                    "type": "label_annotation",
                    "label": flagged_label.description.lower(),
                    "likelihood": flagged_label.score,
                },
            )
        )

    return flags
def find_product_weight(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract product weight insights using ``PRODUCT_WEIGHT_REGEX``.

    Each regex match is passed to the regex's processing function, which
    returns a dict (or None to reject the match). The dict's ``"text"`` entry
    becomes the insight value, its optional ``"automatic_processing"`` entry
    becomes the insight flag, and the remaining entries (plus matcher type,
    priority and notify flag) become the insight data.

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        The list of product weight insights, possibly empty.
    """
    insights = []

    for matcher_type, ocr_regex in PRODUCT_WEIGHT_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        process = ocr_regex.processing_func
        if process is None:
            # Without a processing function no match can produce an insight.
            continue

        for match in ocr_regex.regex.finditer(text):
            payload = process(match)
            if payload is None:
                continue

            payload.update(
                matcher_type=matcher_type,
                priority=ocr_regex.priority,
                notify=ocr_regex.notify,
            )
            # "text" and "automatic_processing" are promoted to insight
            # fields and must not stay in the data payload.
            weight_text = payload.pop("text")
            automatic = payload.pop("automatic_processing", None)

            insights.append(
                RawInsight(
                    value=weight_text,
                    type=InsightType.product_weight,
                    automatic_processing=automatic,
                    data=payload,
                )
            )

    return insights
def generate_raw_insight(
    logo_type: str, logo_value: Optional[str], **kwargs
) -> Optional[RawInsight]:
    """Convert a logo annotation into a ``RawInsight``, when possible.

    Args:
        logo_type: the logo type; it must be a key of ``LOGO_TYPE_MAPPING``.
        logo_value: the logo value. It is stored as ``value`` for brand
            insights and as ``value_tag`` for label insights.
        **kwargs: extra items, stored as the insight ``data``.

    Returns:
        The generated insight, or None if the logo type is not mapped or if a
        brand/label logo has no value.
    """
    if logo_type not in LOGO_TYPE_MAPPING:
        return None

    insight_type = LOGO_TYPE_MAPPING[logo_type]
    is_brand = insight_type == InsightType.brand
    is_label = not is_brand and insight_type == InsightType.label

    # Brand and label insights are meaningless without a value.
    if (is_brand or is_label) and logo_value is None:
        return None

    return RawInsight(
        type=insight_type,
        value_tag=logo_value if is_label else None,
        value=logo_value if is_brand else None,
        automatic_processing=False,
        predictor="universal-logo-detector",
        data=kwargs,
    )
def find_packaging(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract packaging insights with the packaging keyword processor.

    A single keyword may map to several packagings, separated by ``";"``; one
    insight is produced per packaging, all sharing the same matched text.

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        The list of packaging insights, possibly empty.
    """
    text = get_text(content)
    if not text:
        return []

    insights = []
    keyword_matches = KEYWORD_PROCESSOR_STORE.get().extract_keywords(
        text, span_info=True
    )

    for (packaging_str, _), start, end in keyword_matches:
        matched_text = text[start:end]
        insights.extend(
            RawInsight(
                type=InsightType.packaging,
                value_tag=get_tag(packaging),
                value=packaging,
                data={"text": matched_text, "notify": False},
                automatic_processing=True,
            )
            for packaging in packaging_str.split(";")
        )

    return insights
def find_packager_codes_regex(
    ocr_result: Union[OCRResult, str],
) -> List[RawInsight]:
    """Extract packager code insights using the ``PACKAGER_CODE`` regexes.

    The insight value is the result of the regex's processing function when
    one is defined, and the raw matched text otherwise.

    Args:
        ocr_result: the OCR result or a plain string to analyse.

    Returns:
        The list of packager code insights, possibly empty.
    """
    insights: List[RawInsight] = []

    for code_type, ocr_regex in PACKAGER_CODE.items():
        text = get_text(ocr_result, ocr_regex, ocr_regex.lowercase)
        if not text:
            continue

        process = ocr_regex.processing_func
        for match in ocr_regex.regex.finditer(text):
            raw_code = match.group(0)
            insights.append(
                RawInsight(
                    value=raw_code if process is None else process(match),
                    data={
                        "raw": match.group(0),
                        "type": code_type,
                        "notify": ocr_regex.notify,
                    },
                    type=InsightType.packager_code,
                    automatic_processing=True,
                )
            )

    return insights
def find_nutrient_mentions(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Locate nutrient mentions using ``NUTRIENT_MENTIONS_REGEX``.

    For every match, the raw text, its span and a list of languages are
    recorded. The languages are derived from the name of the first named
    group that matched: the part after the last ``"_"`` is dropped and the
    rest is split on ``"_"``.

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        A single-element list holding one ``nutrient_mention`` insight that
        groups all mentions by regex code, or an empty list if nothing
        matched.
    """
    mentions: JSONType = {}

    for regex_code, ocr_regex in NUTRIENT_MENTIONS_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            # Name of the first named group that took part in the match.
            matched_group = next(
                (
                    name
                    for name, captured in match.groupdict().items()
                    if captured is not None
                ),
                None,
            )
            languages: List[str] = []
            if matched_group is not None:
                languages = matched_group.rsplit("_", maxsplit=1)[0].split("_")

            mentions.setdefault(regex_code, []).append(
                {
                    "raw": match.group(0),
                    "span": list(match.span()),
                    "languages": languages,
                }
            )

    if not mentions:
        return []

    return [
        RawInsight(
            type=InsightType.nutrient_mention,
            data={"mentions": mentions, "version": EXTRACTOR_VERSION},
        )
    ]
def find_traces(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract trace insights from the text following a traces prompt.

    Each match of ``TRACES_REGEX`` is treated as a prompt; the 100 characters
    that follow it are scanned with the trace keyword processor, and one
    insight is created per keyword found.

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        The list of trace insights, possibly empty.
    """
    text = get_text(content, TRACES_REGEX)
    if not text:
        return []

    insights = []
    processor = TRACE_KEYWORD_PROCESSOR_STORE.get()

    for prompt_match in TRACES_REGEX.regex.finditer(text):
        prompt = prompt_match.group()
        window_start = prompt_match.end()
        # Only the text right after the prompt is searched for keywords.
        window = text[window_start : window_start + 100]

        insights.extend(
            RawInsight(
                type=InsightType.trace,
                value_tag=trace_tag,
                data={
                    "text": window[start:end],
                    "prompt": prompt,
                    "notify": False,
                },
            )
            for (trace_tag, _), start, end in processor.extract_keywords(
                window, span_info=True
            )
        )

    return insights
def find_nutrient_values(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract nutrient values using ``NUTRIENT_VALUES_REGEX``.

    For every match, group 2 is used as the value (with ``","`` replaced by
    ``"."``) and group 3 as the unit.

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        A single-element list holding one ``nutrient`` insight that groups all
        values by regex code, or an empty list if nothing matched.
    """
    extracted: JSONType = {}

    for nutrient_code, ocr_regex in NUTRIENT_VALUES_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            entry = {
                "raw": match.group(0),
                "nutrient": nutrient_code,
                "value": match.group(2).replace(",", "."),
                "unit": match.group(3),
            }
            extracted.setdefault(nutrient_code, []).append(entry)

    if not extracted:
        return []

    return [
        RawInsight(
            type=InsightType.nutrient,
            data={"nutrients": extracted, "version": EXTRACTOR_VERSION},
        )
    ]
def extract_nutriscore_label(
    image: Image.Image, manual_threshold: float, automatic_threshold: float
) -> Optional[RawInsight]:
    """Detect a Nutri-Score label on an image with the "nutriscore" model.

    Args:
        image: the image to run the object detection model on.
        manual_threshold: threshold used to select detections; detections
            that do not pass ``select(threshold=manual_threshold)`` are
            ignored.
        automatic_threshold: minimum score for the insight to be marked for
            automatic processing.

    Returns:
        A label insight for the detected Nutri-Score, or None when nothing is
        detected or when more than one detection is selected (ambiguous
        result).
    """
    model = ObjectDetectionModelRegistry.get("nutriscore")
    raw_result = model.detect_from_image(image, output_image=False)
    detections = raw_result.select(threshold=manual_threshold)

    if not detections:
        return None

    if len(detections) > 1:
        # `Logger.warn` is a deprecated alias that was removed in Python 3.13;
        # `warning` is the supported spelling.
        # NOTE(review): assumes `logger` is a standard `logging.Logger`.
        logger.warning("more than one nutriscore detected, discarding detections")
        return None

    detection = detections[0]
    score = detection.score

    return RawInsight(
        type=InsightType.label,
        value_tag=NUTRISCORE_LABELS[detection.label],
        automatic_processing=score >= automatic_threshold,
        data={
            "confidence": score,
            "bounding_box": detection.bounding_box,
            "model": "nutriscore",
            "notify": True,
        },
    )
def find_stores(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract store insights using ``STORE_REGEX``.

    For every regex match, the index of the first capturing group that
    matched is used to look the store up in ``SORTED_STORES``.

    Args:
        content: the OCR result or a plain string to analyse.

    Returns:
        The list of store insights, possibly empty.
    """
    text = get_text(content, STORE_REGEX)
    if not text:
        return []

    insights = []

    for match in STORE_REGEX.regex.finditer(text):
        # Only the first group that matched is considered.
        first_hit = next(
            (
                (group_idx, group_text)
                for group_idx, group_text in enumerate(match.groups())
                if group_text is not None
            ),
            None,
        )
        if first_hit is None:
            continue

        group_idx, group_text = first_hit
        store, _ = SORTED_STORES[group_idx]
        insights.append(
            RawInsight(
                type=InsightType.store,
                value=store,
                value_tag=get_store_tag(store),
                data={"text": group_text, "notify": store in NOTIFY_STORES},
            )
        )

    return insights
def _get_raw_insight(probabilily: float, index: int) -> RawInsight:
    """Build a category insight for the category at position ``index``.

    Args:
        probabilily: the predicted probability, stored rounded to 4 decimals
            as the insight confidence. (The parameter name keeps its original
            spelling so that keyword callers keep working.)
        index: position of the predicted category in ``LIST_CATEGORIES``.

    Returns:
        The category insight, with predictor ``"ridge_model-ml"``.
    """
    category_tag = LIST_CATEGORIES[index]
    confidence = round(probabilily, 4)

    return RawInsight(
        type=InsightType.category,
        value_tag=category_tag,
        data={"confidence": confidence},
        predictor="ridge_model-ml",
    )
def extract_image_flag_flashtext(
    processor: KeywordProcessor, text: str
) -> Optional[RawInsight]:
    """Return an image flag insight for the first keyword found in ``text``.

    Args:
        processor: the keyword processor used to search the text.
        text: the text to search.

    Returns:
        An ``image_flag`` insight built from the first keyword match, or None
        if the processor finds no keyword.
    """
    keyword_matches = processor.extract_keywords(text, span_info=True)
    first_match = next(iter(keyword_matches), None)

    if first_match is None:
        return None

    # Later matches are deliberately ignored: one flag per text is enough.
    (_, flag_label), start, end = first_match
    return RawInsight(
        type=InsightType.image_flag,
        data={"text": text[start:end], "type": "text", "label": flag_label},
    )
def find_image_orientation(
    ocr_result: Union[OCRResult, str],
) -> List[RawInsight]:
    """Build an image orientation insight from an OCR result.

    Args:
        ocr_result: the OCR result to analyse. Plain strings carry no
            orientation information and always yield an empty list.

    Returns:
        A single-element list with the orientation insight (the JSON form of
        the orientation result plus a ``"rotation"`` entry), or an empty list
        when no orientation is available.
    """
    orientation_result = (
        None if isinstance(ocr_result, str) else ocr_result.get_orientation()
    )
    if orientation_result is None:
        return []

    payload = orientation_result.to_json()
    payload["rotation"] = get_rotation_angle_from_orientation(
        orientation_result.orientation
    )
    return [RawInsight(type=InsightType.image_orientation, data=payload)]
def extract_addresses(self, content: Union[str, OCRResult]) -> List[RawInsight]:
    """Extract addresses from the given OCR result or text.

    City names are searched in the normalized text; a city is kept only if a
    postal code is found nearby. The reported text extract spans the city and
    postal code matches, widened by ``self.text_extract_distance`` characters
    on each side and clipped to the text bounds.

    Args:
        content (OCRResult or str): a string or the OCR result to process.

    Returns:
        list of RawInsight: one location insight per address found. The
        ``data`` of each insight holds the items: country_code (always
        "fr"), city_name, postal_code and text_extract.
    """
    raw_text = self.get_text(content) if isinstance(content, OCRResult) else content
    text = self.normalize_text(raw_text)
    margin = self.text_extract_distance

    insights = []
    for city, city_start, city_end in self.find_city_names(text):
        postal_code_match = self.find_nearby_postal_code(
            text, city, city_start, city_end
        )
        if postal_code_match is None:
            continue

        # Only the position of the postal code match is needed here; the
        # reported postal code comes from the city itself.
        _, pc_start, pc_end = postal_code_match

        extract_start = max(0, min(city_start, pc_start) - margin)
        extract_end = min(len(text), max(city_end, pc_end) + margin)

        insights.append(
            RawInsight(
                type=InsightType.location,
                data={
                    "country_code": "fr",
                    "city_name": city.name,
                    "postal_code": city.postal_code,
                    "text_extract": text[extract_start:extract_end],
                },
            )
        )

    return insights
def extract_label_flashtext(
    processor: KeywordProcessor, text: str
) -> List[RawInsight]:
    """Create one label insight per keyword found in ``text``.

    Args:
        processor: the keyword processor used to search the text.
        text: the text to search.

    Returns:
        The list of label insights (predictor ``"flashtext"``, never processed
        automatically), possibly empty.
    """
    return [
        RawInsight(
            type=InsightType.label,
            value_tag=label_tag,
            automatic_processing=False,
            predictor="flashtext",
            data={"text": text[start:end], "notify": False},
        )
        for (label_tag, _), start, end in processor.extract_keywords(
            text, span_info=True
        )
    ]
def extract_brands_google_cloud_vision(ocr_result: OCRResult) -> List[RawInsight]:
    """Create brand insights from the logo annotations of an OCR result.

    Only logo annotations whose description is a key of
    ``LOGO_ANNOTATION_BRANDS`` are considered.

    Args:
        ocr_result: the OCR result whose logo annotations are inspected.

    Returns:
        The list of brand insights (predictor ``"google-cloud-vision"``),
        possibly empty.
    """
    brand_insights = []

    for logo_annotation in ocr_result.logo_annotations:
        if logo_annotation.description not in LOGO_ANNOTATION_BRANDS:
            continue

        brand_name = LOGO_ANNOTATION_BRANDS[logo_annotation.description]
        brand_insights.append(
            RawInsight(
                type=InsightType.brand,
                value=brand_name,
                value_tag=get_tag(brand_name),
                automatic_processing=False,
                predictor="google-cloud-vision",
                data={"confidence": logo_annotation.score, "notify": False},
            )
        )

    return brand_insights
def predict(client, product: Dict) -> Optional[ProductInsights]:
    """Predict the category of a product from its localized product names.

    ``predict_category`` is called for every language listed in
    ``product["languages_codes"]`` that has a non-empty
    ``product_name_<lang>`` entry. The prediction with the highest score is
    returned as a category insight.

    Args:
        client: passed through unchanged to ``predict_category``.
        product: the product dict. Must contain ``"code"`` when at least one
            prediction is produced.

    Returns:
        A ``ProductInsights`` holding the single best category insight, or
        None if no prediction could be made.
    """
    predictions = []

    for lang in product.get("languages_codes", []):
        product_name = product.get("product_name_{}".format(lang))
        if not product_name:
            continue

        prediction = predict_category(client, product_name, lang)
        if prediction is None:
            continue

        category, score = prediction
        predictions.append((lang, category, product_name, score))

    if not predictions:
        return None

    # Keep the prediction with the highest score. The score is the 4th tuple
    # item (index 3); index 2 is the product name, which the previous
    # implementation mistakenly used as the sort key. On ties, the first
    # prediction in language order wins, as with a stable descending sort.
    lang, category, product_name, _ = max(predictions, key=operator.itemgetter(3))

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=[
            RawInsight(
                type=InsightType.category,
                value_tag=category,
                data={
                    "lang": lang,
                    "product_name": product_name,
                    "model": "matcher",
                },
            )
        ],
    )
def format_predictions(
    product: Dict, predictions: List[CategoryPrediction], lang: str
) -> ProductInsights:
    """Wrap category predictions into a ``ProductInsights`` object.

    Args:
        product: the product dict; its ``"code"`` entry is used as barcode.
        predictions: ``(category, confidence)`` pairs to convert.
        lang: the language stored in the data of every insight.

    Returns:
        A ``ProductInsights`` of type category with one insight per
        prediction (model ``"neural"``), in the same order.
    """
    category_insights = [
        RawInsight(
            type=InsightType.category,
            value_tag=category,
            data={"lang": lang, "model": "neural", "confidence": confidence},
        )
        for category, confidence in predictions
    ]

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=category_insights,
    )
def extract_fishing_code(
    processor: KeywordProcessor, text: str
) -> List[RawInsight]:
    """Create one packager code insight per fishing keyword found in ``text``.

    Args:
        processor: the keyword processor used to search the text.
        text: the text to search.

    Returns:
        The list of packager code insights of data type ``"fishing"``
        (predictor ``"flashtext"``, processed automatically), possibly empty.
    """
    return [
        RawInsight(
            type=InsightType.packager_code,
            value=code,
            predictor="flashtext",
            data={"type": "fishing", "raw": text[start:end], "notify": False},
            automatic_processing=True,
        )
        for (code, _), start, end in processor.extract_keywords(
            text, span_info=True
        )
    ]
def extract_brands(
    processor: KeywordProcessor, text: str, data_source_name: str
) -> List[RawInsight]:
    """Create one brand insight per brand keyword found in ``text``.

    Args:
        processor: the keyword processor used to search the text; each keyword
            yields a ``(brand_tag, brand)`` pair.
        text: the text to search.
        data_source_name: stored as the predictor of every insight.

    Returns:
        The list of brand insights (never processed automatically), possibly
        empty.
    """
    return [
        RawInsight(
            type=InsightType.brand,
            value=brand,
            value_tag=brand_tag,
            automatic_processing=False,
            predictor=data_source_name,
            data={"text": text[start:end], "notify": False},
        )
        for (brand_tag, brand), start, end in processor.extract_keywords(
            text, span_info=True
        )
    ]
def find_expiration_date(
    content: Union[OCRResult, str],
    *,
    min_year: int = 2015,
    max_year: int = 2025,
) -> List[RawInsight]:
    """Extract expiration date insights using ``EXPIRATION_DATE_REGEX``.

    Typical prompt on French packaging: "À consommer de préférence avant"
    ("best before").

    Each regex match is converted to a date by the regex's processing
    function; matches that cannot be converted, or whose year falls outside
    ``[min_year, max_year]``, are discarded.

    Args:
        content: the OCR result or a plain string to analyse.
        min_year: earliest accepted year (inclusive). Defaults to 2015, the
            previously hard-coded bound.
        max_year: latest accepted year (inclusive). Defaults to 2025, the
            previously hard-coded bound; callers should raise it to accept
            later dates.

    Returns:
        The list of expiration date insights, whose value is the date
        formatted as ``YYYY-MM-DD``; possibly empty.
    """
    results: List[RawInsight] = []

    for type_, ocr_regex in EXPIRATION_DATE_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        process = ocr_regex.processing_func
        if not process:
            # Without a processing function no match can be turned into a date.
            continue

        for match in ocr_regex.regex.finditer(text):
            date = process(match)
            if date is None:
                continue

            # Reject implausible years, most likely OCR or parsing errors.
            if not min_year <= date.year <= max_year:
                continue

            results.append(
                RawInsight(
                    # Format dates according to ISO 8601
                    value=date.strftime("%Y-%m-%d"),
                    type=InsightType.expiration_date,
                    data={
                        "raw": match.group(0),
                        "type": type_,
                        "notify": ocr_regex.notify,
                    },
                )
            )

    return results
def get_image_lang(ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Build an image language insight from the languages of an OCR result.

    Args:
        ocr_result: the OCR result to analyse. Plain strings carry no language
            information and always yield an empty list.

    Returns:
        A single-element list with the ``image_lang`` insight, holding the raw
        counts and, for every key other than ``"words"``, the count expressed
        as a percentage of the ``"words"`` count; or an empty list when no
        language information is available.
    """
    if isinstance(ocr_result, str):
        return []

    language_counts: Optional[Dict[str, int]] = ocr_result.get_languages()
    if language_counts is None:
        return []

    total_words = language_counts["words"]
    percents = {
        lang: count * 100 / total_words
        for lang, count in language_counts.items()
        if lang != "words"
    }

    return [
        RawInsight(
            type=InsightType.image_lang,
            data={"count": language_counts, "percent": percents},
        )
    ]