def extract_brands(
    processor: KeywordProcessor,
    text: str,
    data_source_name: str,
    automatic_processing: bool,
) -> List[Prediction]:
    """Build one brand prediction per keyword match found in ``text``.

    Every item returned by ``processor.extract_keywords`` is unpacked as a
    ``(brand_tag, brand)`` pair plus the start/end offsets of the match.

    :param processor: keyword processor, queried with ``span_info=True``
    :param text: the text to search
    :param data_source_name: stored as the ``predictor`` of each prediction
    :param automatic_processing: forwarded unchanged to each prediction
    :return: one prediction per match, in the order matches were reported
    """
    keyword_matches = processor.extract_keywords(text, span_info=True)
    return [
        Prediction(
            type=PredictionType.brand,
            value=brand,
            value_tag=brand_tag,
            automatic_processing=automatic_processing,
            predictor=data_source_name,
            # Keep the exact substring that triggered the match.
            data={"text": text[start:end], "notify": False},
        )
        for (brand_tag, brand), start, end in keyword_matches
    ]
def get_image_lang(ocr_result: Union[OCRResult, str]) -> List[Prediction]:
    """Summarise the language counts reported by an OCR result.

    :param ocr_result: an OCR result, or a plain string (which carries no
        language information and yields no prediction)
    :return: an empty list when no counts are available, otherwise a single
        ``image_lang`` prediction holding the raw counts and, for every key
        except ``"words"``, its share of the ``"words"`` total in percent
    """
    if isinstance(ocr_result, str):
        return []

    image_lang: Optional[Dict[str, int]] = ocr_result.get_languages()
    if image_lang is None:
        return []

    # "words" is the denominator for every other entry.
    total_words = image_lang["words"]
    percents = {
        key: count * 100 / total_words
        for key, count in image_lang.items()
        if key != "words"
    }
    return [
        Prediction(
            type=PredictionType.image_lang,
            data={"count": image_lang, "percent": percents},
        )
    ]
def find_product_weight(content: Union[OCRResult, str]) -> List[Prediction]:
    """Extract product weight predictions using ``PRODUCT_WEIGHT_REGEX``.

    For each configured regex, every match is handed to the regex's
    ``processing_func``; matches for which there is no processing function,
    or for which it returns ``None``, are ignored.

    :param content: OCR result or raw text to search
    :return: one prediction per successfully processed match
    """
    predictions = []
    for matcher_type, ocr_regex in PRODUCT_WEIGHT_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            process = ocr_regex.processing_func
            if process is None:
                continue

            data = process(match)
            if data is None:
                continue

            # Annotate the processed match with the matcher's metadata.
            data.update(
                matcher_type=matcher_type,
                priority=ocr_regex.priority,
                notify=ocr_regex.notify,
            )
            # "text" becomes the prediction value; the rest stays in data.
            weight_text = data.pop("text")
            predictions.append(
                Prediction(
                    value=weight_text,
                    type=PredictionType.product_weight,
                    automatic_processing=data["automatic_processing"],
                    data=data,
                )
            )
    return predictions
def test_import_insights_single_product(self, mocker):
    """A single stored prediction is rebuilt and handed to the importer."""
    product_store = FakeProductStore()
    # Raw dict as returned by the (mocked) prediction lookup...
    stored_prediction = {
        "barcode": DEFAULT_BARCODE,
        "type": PredictionType.category.name,
        "data": {},
    }
    # ...and the Prediction object expected to be built from it.
    expected_prediction = Prediction(
        barcode=DEFAULT_BARCODE,
        type=PredictionType.category,
        data={},
    )
    get_product_predictions_mock = mocker.patch(
        "robotoff.insights.importer.get_product_predictions",
        return_value=[stored_prediction],
    )
    import_insights_mock = mocker.patch(
        "robotoff.insights.importer.InsightImporter.import_insights",
        return_value=1,
    )

    imported = import_insights_for_products(
        {DEFAULT_BARCODE: {PredictionType.category}},
        DEFAULT_SERVER_DOMAIN,
        automatic=True,
        product_store=product_store,
    )

    assert imported == 1
    get_product_predictions_mock.assert_called_once()
    import_insights_mock.assert_called_once_with(
        [expected_prediction], DEFAULT_SERVER_DOMAIN, True, product_store
    )
def test_notify_image_flag_public(mocker, monkeypatch):
    """Check the Slack post sent for a potentially sensitive public image."""
    post_mock = mocker.patch(
        "robotoff.slack.http_session.post", return_value=MockSlackResponse()
    )
    # Remove any ROBOTOFF_SCHEME override to force defaults to apply.
    monkeypatch.delenv("ROBOTOFF_SCHEME", raising=False)
    notifier = slack.SlackNotifier("")
    flagged = Prediction(
        type=PredictionType.image_flag,
        data={"text": "bad_word", "type": "SENSITIVE", "label": "flagged"},
    )

    notifier.notify_image_flag([flagged], "/source_image", "123")

    expected_text = f"type: SENSITIVE\nlabel: *flagged*, match: bad_word\n\n <{settings.OFF_IMAGE_BASE_URL}/source_image|Image> -- <https://world.{settings._robotoff_domain}/cgi/product.pl?type=edit&code=123|*Edit*>"
    expected_image_url = f"{settings.OFF_IMAGE_BASE_URL}/source_image"
    post_mock.assert_called_once_with(
        notifier.POST_MESSAGE_URL,
        data=PartialRequestMatcher(
            expected_text,
            notifier.ROBOTOFF_PUBLIC_IMAGE_ALERT_CHANNEL,
            expected_image_url,
        ),
    )
def test_notify_image_flag_private(mocker, monkeypatch):
    """Check the Slack post sent for a potentially sensitive private image."""
    post_mock = mocker.patch(
        "robotoff.slack.http_session.post", return_value=MockSlackResponse()
    )
    # Remove any ROBOTOFF_SCHEME override to force defaults to apply.
    monkeypatch.delenv("ROBOTOFF_SCHEME", raising=False)
    notifier = slack.SlackNotifier("")
    flagged = Prediction(
        type=PredictionType.image_flag,
        data={"type": "label_annotation", "label": "face", "likelihood": 0.8},
    )

    notifier.notify_image_flag([flagged], "/source_image", "123")

    expected_text = f"type: label_annotation\nlabel: *face*, score: 0.8\n\n <{settings.OFF_IMAGE_BASE_URL}/source_image|Image> -- <https://world.{settings._robotoff_domain}/cgi/product.pl?type=edit&code=123|*Edit*>"
    expected_image_url = f"{settings.OFF_IMAGE_BASE_URL}/source_image"
    post_mock.assert_called_once_with(
        notifier.POST_MESSAGE_URL,
        data=PartialRequestMatcher(
            expected_text,
            notifier.ROBOTOFF_PRIVATE_IMAGE_ALERT_CHANNEL,
            expected_image_url,
        ),
    )
def find_packaging(content: Union[OCRResult, str]) -> List[Prediction]:
    """Extract packaging predictions from the text of ``content``.

    A keyword match may carry several packagings separated by ``";"``; one
    prediction is emitted for each of them, all sharing the matched text.

    :param content: OCR result or raw text to search
    :return: the packaging predictions (empty when there is no text)
    """
    text = get_text(content)
    if not text:
        return []

    predictions = []
    processor = KEYWORD_PROCESSOR_STORE.get()
    keyword_matches = processor.extract_keywords(text, span_info=True)
    for (packaging_str, _), start, end in keyword_matches:
        matched_text = text[start:end]
        predictions.extend(
            Prediction(
                type=PredictionType.packaging,
                value_tag=get_tag(packaging),
                value=packaging,
                data={"text": matched_text, "notify": False},
                automatic_processing=True,
            )
            for packaging in packaging_str.split(";")
        )
    return predictions
def find_traces(content: Union[OCRResult, str]) -> List[Prediction]:
    """Extract trace predictions from the text following each prompt match.

    ``TRACES_REGEX`` locates the prompts; the keyword processor is then run
    on the (at most) 100 characters that follow each prompt.

    :param content: OCR result or raw text to search
    :return: one prediction per keyword found after a prompt
    """
    text = get_text(content, TRACES_REGEX)
    if not text:
        return []

    predictions = []
    processor = TRACE_KEYWORD_PROCESSOR_STORE.get()
    for prompt_match in TRACES_REGEX.regex.finditer(text):
        prompt = prompt_match.group()
        window_start = prompt_match.end()
        # Only look at a bounded window right after the prompt.
        window = text[window_start:window_start + 100]
        keyword_matches = processor.extract_keywords(window, span_info=True)
        predictions.extend(
            Prediction(
                type=PredictionType.trace,
                value_tag=trace_tag,
                data={
                    "text": window[start:end],
                    "prompt": prompt,
                    "notify": False,
                },
            )
            for (trace_tag, _), start, end in keyword_matches
        )
    return predictions
def find_stores(content: Union[OCRResult, str]) -> List[Prediction]:
    """Extract store predictions using ``STORE_REGEX``.

    For each regex match, the index of the first non-``None`` group is used
    to look up the store in ``SORTED_STORES``.

    :param content: OCR result or raw text to search
    :return: at most one prediction per regex match
    """
    text = get_text(content, STORE_REGEX)
    if not text:
        return []

    predictions = []
    for match in STORE_REGEX.regex.finditer(text):
        for group_idx, matched_text in enumerate(match.groups()):
            if matched_text is None:
                continue
            store, _ = SORTED_STORES[group_idx]
            predictions.append(
                Prediction(
                    type=PredictionType.store,
                    value=store,
                    value_tag=get_store_tag(store),
                    data={
                        "text": matched_text,
                        "notify": store in NOTIFY_STORES,
                    },
                )
            )
            # Only the first populated group counts for this match.
            break
    return predictions
def find_packager_codes_regex(
    ocr_result: Union[OCRResult, str],
) -> List[Prediction]:
    """Extract packager code predictions using the ``PACKAGER_CODE`` regexes.

    The prediction value is the raw match, or the output of the regex's
    ``processing_func`` when one is configured.

    :param ocr_result: OCR result or raw text to search
    :return: one prediction per regex match
    """
    results: List[Prediction] = []
    for regex_code, ocr_regex in PACKAGER_CODE.items():
        text = get_text(ocr_result, ocr_regex, ocr_regex.lowercase)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            raw = match.group(0)
            value = (
                raw
                if ocr_regex.processing_func is None
                else ocr_regex.processing_func(match)
            )
            results.append(
                Prediction(
                    value=value,
                    data={
                        "raw": raw,
                        "type": regex_code,
                        "notify": ocr_regex.notify,
                    },
                    type=PredictionType.packager_code,
                    automatic_processing=True,
                )
            )
    return results
def generate_insights(
    self,
    max_errors: Optional[int] = None,
    lang: str = "fr",
    limit: Optional[int] = None,
) -> Iterable[Prediction]:
    """Yield ingredient spellcheck predictions from the JSONL dataset.

    Products are restricted to the ``en:france`` country tag, to the given
    ``lang`` and to those with a non-empty ``ingredients_text_fr`` field
    (that field name is fixed, whatever ``lang`` is).

    :param max_errors: forwarded to ``is_product_valid``
    :param lang: value required in the product ``lang`` field; also stored
        under ``"lang"`` in the prediction data
    :param limit: stop once this many predictions were yielded
        (``None`` means no limit)
    :return: an iterator over the generated predictions
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    stream = dataset.stream()
    stream = stream.filter_by_country_tag("en:france")
    stream = stream.filter_text_field("lang", lang)
    stream = stream.filter_nonempty_text_field("ingredients_text_fr")

    yielded = 0
    for product in stream.iter():
        if not self.is_product_valid(product, max_errors=max_errors):
            continue

        insight = self.predict_insight(product["ingredients_text_fr"])
        if insight is None:
            continue

        insight["lang"] = lang
        yield Prediction(
            type=PredictionType.ingredient_spellcheck,
            data=insight,
            barcode=product["code"],
        )
        yielded += 1

        if limit is not None and yielded >= limit:
            break
def generate_prediction(
    logo_type: str,
    logo_value: Optional[str],
    automatic_processing: Optional[bool] = False,
    **kwargs,
) -> Optional[Prediction]:
    """Turn a logo type/value pair into a ``Prediction``.

    :param logo_type: key looked up in ``LOGO_TYPE_MAPPING``
    :param logo_value: value attached to the logo; required for brand and
        label predictions
    :param automatic_processing: forwarded unchanged to the prediction
    :param kwargs: stored as the prediction ``data``
    :return: ``None`` when ``logo_type`` is not mapped, or when a brand or
        label logo has no value; the prediction otherwise
    """
    if logo_type not in LOGO_TYPE_MAPPING:
        return None

    prediction_type = LOGO_TYPE_MAPPING[logo_type]
    is_brand = prediction_type == PredictionType.brand
    is_label = not is_brand and prediction_type == PredictionType.label

    if (is_brand or is_label) and logo_value is None:
        return None

    # Brands fill both value and value_tag, labels only value_tag, and any
    # other prediction type leaves both unset.
    value_tag = logo_value if (is_brand or is_label) else None
    value = logo_value if is_brand else None

    return Prediction(
        type=prediction_type,
        value_tag=value_tag,
        value=value,
        automatic_processing=automatic_processing,
        predictor="universal-logo-detector",
        data=kwargs,
    )
def import_insights_for_products(
    prediction_types_by_barcode: Dict[str, Set[PredictionType]],
    server_domain: str,
    automatic: bool,
    product_store: DBProductStore,
) -> int:
    """Re-compute insights for products with new predictions.

    For every importer, only the barcodes whose updated prediction types
    cover all the types required by that importer are processed.

    :param prediction_types_by_barcode: a dict that associates each barcode
        with the set of prediction types that were updated
    :param server_domain: forwarded to each importer
    :param automatic: forwarded to each importer
    :param product_store: forwarded to each importer
    :return: Number of imported insights
    """
    total_imported = 0
    for importer in IMPORTERS:
        required_types = importer.get_required_prediction_types()
        selected_barcodes: List[str] = [
            barcode
            for barcode, updated_types in prediction_types_by_barcode.items()
            if updated_types >= required_types
        ]
        if not selected_barcodes:
            continue

        stored = get_product_predictions(selected_barcodes, list(required_types))
        predictions = [Prediction(**item) for item in stored]
        total_imported += importer.import_insights(
            predictions, server_domain, automatic, product_store
        )
    return total_imported
def find_nutrient_values(content: Union[OCRResult, str]) -> List[Prediction]:
    """Extract nutrient values using the ``NUTRIENT_VALUES_REGEX`` patterns.

    Matches are grouped by regex code. Group 2 of a match is the value
    (commas replaced by dots) and group 3 is the unit.

    :param content: OCR result or raw text to search
    :return: an empty list when nothing matched, otherwise a single
        ``nutrient`` prediction holding every match
    """
    nutrients: JSONType = {}
    for regex_code, ocr_regex in NUTRIENT_VALUES_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            entry = {
                "raw": match.group(0),
                "nutrient": regex_code,
                "value": match.group(2).replace(",", "."),
                "unit": match.group(3),
            }
            nutrients.setdefault(regex_code, []).append(entry)

    if not nutrients:
        return []

    payload = {"nutrients": nutrients, "version": EXTRACTOR_VERSION}
    return [Prediction(type=PredictionType.nutrient, data=payload)]
def test_add_category_insight_with_ml_insights(mocker):
    """The classifier prediction is passed on to ``import_insights``."""
    product_updated = "robotoff.workers.tasks.product_updated"
    classifier_prediction = Prediction(
        barcode="123",
        type=PredictionType.category,
        value_tag="en:chicken",
        data={"lang": "xx", "model": "neural", "confidence": 0.9},
        automatic_processing=True,
    )
    mocker.patch(
        f"{product_updated}.predict_category_from_product_es",
        return_value=None,
    )
    mocker.patch(
        f"{product_updated}.CategoryClassifier.predict",
        return_value=[classifier_prediction],
    )
    import_insights_mock = mocker.patch(
        f"{product_updated}.import_insights",
        return_value=1,
    )
    server_domain = settings.BaseURLProvider().get()

    imported = add_category_insight("123", {"code": "123"}, server_domain)

    # Compare against a freshly built object rather than the instance the
    # mocked classifier returned.
    forwarded_prediction = Prediction(
        barcode="123",
        type=PredictionType.category,
        value_tag="en:chicken",
        data={"lang": "xx", "model": "neural", "confidence": 0.9},
        automatic_processing=True,
    )
    import_insights_mock.assert_called_once_with(
        [forwarded_prediction],
        server_domain,
        automatic=True,
    )
    assert imported
def to_prediction(self) -> Prediction:
    """Build the ``Prediction`` equivalent of this category prediction.

    Automatic processing is enabled when the confidence reaches
    ``NEURAL_CONFIDENCE_THRESHOLD``.
    """
    is_automatic = self.confidence >= self.NEURAL_CONFIDENCE_THRESHOLD
    details = {"lang": "xx", "model": "neural", "confidence": self.confidence}
    return Prediction(
        type=PredictionType.category,
        value_tag=self.category,
        data=details,
        automatic_processing=is_automatic,
    )
def flag_image(content: Union[OCRResult, str]) -> List[Prediction]:
    """Collect ``image_flag`` predictions for ``content``.

    Three sources are checked: keywords in the text, the safe-search
    annotation ("adult" and "violence" rated at least ``VERY_LIKELY``) and
    the label annotations (first label listed in ``LABELS_TO_FLAG`` with a
    score of at least 0.6). The last two apply to OCR results only.

    :param content: OCR result or raw text to inspect
    :return: the flag predictions found
    """
    predictions: List[Prediction] = []

    text_flag = extract_image_flag_flashtext(PROCESSOR, get_text(content))
    if text_flag is not None:
        predictions.append(text_flag)

    # A plain string has no annotations to inspect.
    if isinstance(content, str):
        return predictions

    safe_search_annotation = content.get_safe_search_annotation()
    label_annotations = content.get_label_annotations()

    if safe_search_annotation:
        for key in ("adult", "violence"):
            value: SafeSearchAnnotationLikelihood = getattr(
                safe_search_annotation, key
            )
            if not (value >= SafeSearchAnnotationLikelihood.VERY_LIKELY):
                continue
            predictions.append(
                Prediction(
                    type=PredictionType.image_flag,
                    data={
                        "type": "safe_search_annotation",
                        "label": key,
                        "likelihood": value.name,
                    },
                )
            )

    # Report at most one label annotation: the first that qualifies.
    for label_annotation in label_annotations:
        if label_annotation.description not in LABELS_TO_FLAG:
            continue
        if not (label_annotation.score >= 0.6):
            continue
        predictions.append(
            Prediction(
                type=PredictionType.image_flag,
                data={
                    "type": "label_annotation",
                    "label": label_annotation.description.lower(),
                    "likelihood": label_annotation.score,
                },
            )
        )
        break

    return predictions
def generate_prediction(
    value, data: Dict[str, Any], automatic_processing: Optional[bool] = None
):
    """Build a product weight ``Prediction`` for ``DEFAULT_BARCODE``.

    The predictor is always ``"ocr"``; the other arguments are stored as-is.
    """
    fields = dict(
        barcode=DEFAULT_BARCODE,
        value=value,
        type=PredictionType.product_weight,
        data=data,
        automatic_processing=automatic_processing,
        predictor="ocr",
    )
    return Prediction(**fields)
def matcher_prediction(category):
    """Build a non-automatic "matcher" category prediction for ``barcode1``."""
    matcher_data = {
        "lang": "en",
        "product_name": "test",
        "model": "matcher",
    }
    return Prediction(
        barcode=barcode1,
        type=PredictionType.category,
        value_tag=category,
        data=matcher_data,
        automatic_processing=False,
    )
def neural_prediction(category, confidence=0.7, auto=False):
    """Build a "neural" category prediction for ``barcode1``.

    ``confidence`` is stored in the prediction data and ``auto`` becomes its
    ``automatic_processing`` flag.
    """
    neural_data = {"lang": "xx", "model": "neural", "confidence": confidence}
    return Prediction(
        barcode=barcode1,
        type=PredictionType.category,
        value_tag=category,
        data=neural_data,
        automatic_processing=auto,
    )
def test_generate_candidates(self):
    """A packager code prediction yields exactly one matching candidate."""
    product = Product({"emb_codes_tags": ["FR 50.200.000 CE"]})
    prediction = Prediction(
        type=PredictionType.packager_code, value="fr 40.261.001 ce"
    )

    candidates = list(
        PackagerCodeInsightImporter.generate_candidates(product, [prediction])
    )

    assert len(candidates) == 1
    candidate = candidates[0]
    assert isinstance(candidate, ProductInsight)
    assert candidate.value == prediction.value
    assert candidate.type == InsightType.packager_code
def test_category_prediction_to_prediction():
    """A category prediction converts to a non-automatic ``Prediction``.

    The expected object is built with ``PredictionType`` so that its
    ``type`` is the same enum as the one used by predictions elsewhere,
    instead of relying on an ``InsightType`` member comparing equal.
    """
    category_prediction = CategoryPrediction("category", 0.5)
    expected = Prediction(
        type=PredictionType.category,
        value_tag="category",
        data={"lang": "xx", "model": "neural", "confidence": 0.5},
        automatic_processing=False,
    )
    assert category_prediction.to_prediction() == expected
def test_category_prediction_to_prediction_auto(monkeypatch):
    """A confidence equal to the threshold enables automatic processing.

    The expected object is built with ``PredictionType`` so that its
    ``type`` is the same enum as the one used by predictions elsewhere,
    instead of relying on an ``InsightType`` member comparing equal.
    """
    # Pin the threshold so the test does not depend on its configured value.
    monkeypatch.setattr(CategoryPrediction, "NEURAL_CONFIDENCE_THRESHOLD", 0.9)
    category_prediction = CategoryPrediction("category", 0.9)
    expected = Prediction(
        type=PredictionType.category,
        value_tag="category",
        data={"lang": "xx", "model": "neural", "confidence": 0.9},
        automatic_processing=True,
    )
    assert category_prediction.to_prediction() == expected
def find_image_orientation(
    ocr_result: Union[OCRResult, str],
) -> List[Prediction]:
    """Build an ``image_orientation`` prediction from an OCR result.

    :param ocr_result: an OCR result, or a plain string (which yields no
        prediction)
    :return: an empty list when no orientation is available, otherwise a
        single prediction whose data is the orientation's JSON form plus a
        ``"rotation"`` entry
    """
    if isinstance(ocr_result, str):
        return []

    orientation_result = ocr_result.get_orientation()
    if orientation_result is None:
        return []

    orientation_data = orientation_result.to_json()
    orientation_data["rotation"] = get_rotation_angle_from_orientation(
        orientation_result.orientation
    )
    return [
        Prediction(type=PredictionType.image_orientation, data=orientation_data)
    ]
def test_import_insights_invalid_types(self):
    """Importing a prediction of a non-required type raises ``ValueError``."""

    class FakeImporter(InsightImporter):
        @staticmethod
        def get_required_prediction_types():
            return {PredictionType.category, PredictionType.image_flag}

    # "label" is not among the types FakeImporter declares as required.
    unexpected = [Prediction(type=PredictionType.label)]
    with pytest.raises(ValueError, match="unexpected prediction type: 'label'"):
        FakeImporter.import_insights(
            unexpected,
            DEFAULT_SERVER_DOMAIN,
            automatic=True,
            product_store=FakeProductStore(),
        )
def extract_image_flag_flashtext(
    processor: KeywordProcessor, text: str
) -> Optional[Prediction]:
    """Return an ``image_flag`` prediction for the first keyword in ``text``.

    :param processor: keyword processor, queried with ``span_info=True``
    :param text: the text to search
    :return: a prediction built from the first reported match, or ``None``
        when the processor reports no match
    """
    keyword_matches = processor.extract_keywords(text, span_info=True)
    for (_, label), start, end in keyword_matches:
        # Only the first match is reported; later ones are never examined.
        return Prediction(
            type=PredictionType.image_flag,
            data={"text": text[start:end], "type": "text", "label": label},
        )
    return None
def predict(client, product: Dict) -> Optional[Prediction]:
    """Predict product categories using ES.

    A category is predicted from the product name in each of the product's
    languages, and the candidate with the highest score is returned. When
    several candidates share the highest score, the first one (in
    ``languages_codes`` order) wins.

    :param elasticsearch.Elasticsearch client: connection to ES instance
    :param product: product properties
    :return: a category Prediction or None if no prediction was available
    """
    # Candidates are (lang, category, product_name, score) tuples.
    candidates = []

    for lang in product.get("languages_codes", []):
        product_name = product.get("product_name_{}".format(lang))
        if not product_name:
            continue

        prediction = predict_category(client, product_name, lang)
        if prediction is None:
            continue

        category, score = prediction
        candidates.append((lang, category, product_name, score))

    if not candidates:
        return None

    # The score is the 4th tuple item (index 3). Index 2 is the product
    # name, so ranking on it would pick a candidate alphabetically.
    lang, category, product_name, _ = max(candidates, key=operator.itemgetter(3))

    return Prediction(
        type=PredictionType.category,
        barcode=product["code"],
        value_tag=category,
        data={
            "lang": lang,
            "product_name": product_name,
            "model": "matcher",
        },
        automatic_processing=False,
    )
def extract_addresses(self, content: Union[str, OCRResult]) -> List[Prediction]:
    """Extract addresses from the given OCR result.

    A city name is kept only when a postal code is found near it.

    Args:
        content (OCRResult or str): a string or the OCR result to
            process.

    Returns:
        list of Prediction: List of addresses extracted from the text.
        The data of each prediction holds the items: country_code
        (always "fr"), city_name, postal_code and text_extract.
    """
    raw_text = self.get_text(content) if isinstance(content, OCRResult) else content
    text = self.normalize_text(raw_text)

    locations = []
    for city, city_start, city_end in self.find_city_names(text):
        pc_match = self.find_nearby_postal_code(text, city, city_start, city_end)
        if pc_match is None:
            continue

        _, pc_start, pc_end = pc_match
        # Widen the span covering city and postal code by the configured
        # distance, clamped to the bounds of the text.
        extract_start = max(
            0, min(city_start, pc_start) - self.text_extract_distance
        )
        extract_end = min(
            len(text), max(city_end, pc_end) + self.text_extract_distance
        )
        locations.append(
            Prediction(
                type=PredictionType.location,
                data={
                    "country_code": "fr",
                    "city_name": city.name,
                    "postal_code": city.postal_code,
                    "text_extract": text[extract_start:extract_end],
                },
            )
        )
    return locations
def test_generate_insights_missing_product_no_references(self, mocker):
    """No insight is generated with a ``FakeProductStore`` and no existing insight."""
    get_existing_insight_mock = mocker.patch(
        "robotoff.insights.importer.get_existing_insight", return_value=[]
    )
    prediction = Prediction(
        type=PredictionType.category,
        barcode=DEFAULT_BARCODE,
        data={},
    )

    generated = list(
        InsightImporter.generate_insights(
            [prediction],
            DEFAULT_SERVER_DOMAIN,
            automatic=True,
            product_store=FakeProductStore(),
        )
    )

    assert generated == []
    get_existing_insight_mock.assert_called_once()
def refresh_insights(
    barcode: str,
    server_domain: str,
    automatic: bool,
    product_store: Optional[DBProductStore] = None,
) -> int:
    """Refresh all insights for a specific product.

    All predictions of the product are fetched and handed to each
    InsightImporter whose required prediction types are all present.

    Unlike `import_insights`, no prediction is created here: this is only
    a refresh based on the predictions currently in the database. It is
    useful after a Product Opener update (some insights may be invalid).

    :param barcode: Barcode of the product.
    :param server_domain: The server domain associated with the predictions.
    :param automatic: If False, no insight is applied automatically.
    :param product_store: The product store to use, defaults to None
    :return: The number of imported insights.
    """
    if product_store is None:
        product_store = get_product_store()

    predictions = [Prediction(**item) for item in get_product_predictions([barcode])]
    available_types = {prediction.type for prediction in predictions}

    total_imported = 0
    for importer in IMPORTERS:
        required_types = importer.get_required_prediction_types()
        # Skip importers for which a required prediction type is missing.
        if not (available_types >= required_types):
            continue

        relevant = [
            prediction
            for prediction in predictions
            if prediction.type in required_types
        ]
        total_imported += importer.import_insights(
            relevant,
            server_domain,
            automatic,
            product_store,
        )
    return total_imported