Exemple #1
0
def test_product_insights_merge():
    """Check that ``ProductInsights.merge`` combines two compatible objects.

    Both inputs share the same barcode, insight type and source image; the
    merged object must keep those fields and expose the insights of the first
    input followed by the insights of the second one.
    """
    barcode = "123"
    source_image = "/123/1.jpg"

    organic_insights = [
        RawInsight(type=InsightType.label, data={}, value_tag="en:organic")
    ]
    pgi_insights = [
        RawInsight(type=InsightType.label, data={}, value_tag="en:pgi")
    ]

    first = ProductInsights(
        insights=organic_insights,
        barcode=barcode,
        type=InsightType.label,
        source_image=source_image,
    )
    second = ProductInsights(
        insights=pgi_insights,
        barcode=barcode,
        type=InsightType.label,
        source_image=source_image,
    )

    merged = ProductInsights.merge([first, second])

    assert merged.type == InsightType.label
    assert merged.barcode == "123"
    assert merged.source_image == "/123/1.jpg"
    # Insights are expected in input order: first object, then second.
    assert merged.insights == organic_insights + pgi_insights
Exemple #2
0
def find_labels(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Collect label insights from regex, keyword and logo-annotation matches.

    Three sources are combined, in this order:

    1. every regex registered in ``LABELS_REGEX`` (predictor ``"regex"``),
    2. the keyword processor obtained from ``LABEL_KEYWORD_PROCESSOR_STORE``,
       through ``extract_label_flashtext``,
    3. when ``content`` is an ``OCRResult``, its logo annotations whose
       description is a key of ``LOGO_ANNOTATION_LABELS`` (predictor
       ``"google-cloud-vision"``).

    Args:
        content: an ``OCRResult`` or a plain string to search.

    Returns:
        The list of label ``RawInsight`` objects found, possibly empty.
    """
    found = []

    for default_tag, patterns in LABELS_REGEX.items():
        for pattern in patterns:
            regex_text = get_text(content, pattern)
            if not regex_text:
                continue

            for match in pattern.regex.finditer(regex_text):
                if pattern.processing_func:
                    # The processing function computes the tag itself and
                    # rejects the match by returning None.
                    tag = pattern.processing_func(match)
                    if tag is None:
                        continue
                else:
                    tag = default_tag

                details = {"text": match.group(), "notify": pattern.notify}
                found.append(
                    RawInsight(
                        type=InsightType.label,
                        value_tag=tag,
                        predictor="regex",
                        data=details,
                    )
                )

    keyword_processor = LABEL_KEYWORD_PROCESSOR_STORE.get()
    found.extend(extract_label_flashtext(keyword_processor, get_text(content)))

    # Logo annotations are only read from OCRResult inputs.
    if not isinstance(content, OCRResult):
        return found

    for logo in content.logo_annotations:
        description = logo.description
        if description not in LOGO_ANNOTATION_LABELS:
            continue

        found.append(
            RawInsight(
                type=InsightType.label,
                value_tag=LOGO_ANNOTATION_LABELS[description],
                automatic_processing=False,
                predictor="google-cloud-vision",
                data={"confidence": logo.score},
            )
        )

    return found
Exemple #3
0
def flag_image(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Build ``image_flag`` insights for content that should be flagged.

    The text is first scanned with ``extract_image_flag_flashtext``. When
    ``content`` is not a string, its safe-search annotation and label
    annotations are inspected as well.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A list of ``RawInsight`` objects of type ``InsightType.image_flag``,
        possibly empty.
    """
    flags: List[RawInsight] = []

    text_flag = extract_image_flag_flashtext(PROCESSOR, get_text(content))
    if text_flag is not None:
        flags.append(text_flag)

    # Plain strings carry no annotations: only the text check applies.
    if isinstance(content, str):
        return flags

    safe_search = content.get_safe_search_annotation()
    label_annotations = content.get_label_annotations()

    if safe_search:
        for category in ("adult", "violence"):
            likelihood: SafeSearchAnnotationLikelihood = getattr(
                safe_search, category
            )
            if likelihood < SafeSearchAnnotationLikelihood.VERY_LIKELY:
                pass
            if likelihood >= SafeSearchAnnotationLikelihood.VERY_LIKELY:
                flags.append(
                    RawInsight(
                        type=InsightType.image_flag,
                        data={
                            "type": "safe_search_annotation",
                            "label": category,
                            "likelihood": likelihood.name,
                        },
                    )
                )

    for annotation in label_annotations:
        is_flagged = (
            annotation.description in LABELS_TO_FLAG and annotation.score >= 0.6
        )
        if not is_flagged:
            continue

        flags.append(
            RawInsight(
                type=InsightType.image_flag,
                data={
                    "type": "label_annotation",
                    "label": annotation.description.lower(),
                    "likelihood": annotation.score,
                },
            )
        )
        # Only the first matching label annotation is reported.
        break

    return flags
Exemple #4
0
def find_product_weight(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract product weight insights using ``PRODUCT_WEIGHT_REGEX``.

    Each regex match is handed to the regex's processing function; matches for
    which there is no processing function, or for which it returns ``None``,
    are skipped. The ``"text"`` item of the processed result becomes the
    insight value, and the remaining items (plus ``matcher_type``,
    ``priority`` and ``notify``) become the insight data.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A list of ``RawInsight`` objects of type
        ``InsightType.product_weight``, possibly empty.
    """
    weights = []

    for matcher_type, ocr_regex in PRODUCT_WEIGHT_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            process = ocr_regex.processing_func
            if process is None:
                continue

            parsed = process(match)
            if parsed is None:
                continue

            parsed["matcher_type"] = matcher_type
            parsed["priority"] = ocr_regex.priority
            parsed["notify"] = ocr_regex.notify

            # "text" and "automatic_processing" are promoted to insight
            # fields, so they are removed from the data payload.
            weight_text = parsed.pop("text")
            automatic = parsed.pop("automatic_processing", None)

            weights.append(
                RawInsight(
                    value=weight_text,
                    type=InsightType.product_weight,
                    automatic_processing=automatic,
                    data=parsed,
                )
            )

    return weights
Exemple #5
0
def generate_raw_insight(logo_type: str, logo_value: Optional[str],
                         **kwargs) -> Optional[RawInsight]:
    """Turn a detected logo into a ``RawInsight``, when its type is supported.

    Args:
        logo_type: logo type, looked up in ``LOGO_TYPE_MAPPING``.
        logo_value: value attached to the logo; stored as ``value`` for brand
            insights and as ``value_tag`` for label insights.
        **kwargs: extra items, stored as the insight ``data``.

    Returns:
        The generated ``RawInsight``, or ``None`` when ``logo_type`` is not in
        ``LOGO_TYPE_MAPPING`` or when a brand/label logo has no value.
    """
    if logo_type not in LOGO_TYPE_MAPPING:
        return None

    insight_type = LOGO_TYPE_MAPPING[logo_type]

    is_brand = insight_type == InsightType.brand
    is_label = not is_brand and insight_type == InsightType.label

    # Brand and label insights are meaningless without a value.
    if (is_brand or is_label) and logo_value is None:
        return None

    return RawInsight(
        type=insight_type,
        value_tag=logo_value if is_label else None,
        value=logo_value if is_brand else None,
        automatic_processing=False,
        predictor="universal-logo-detector",
        data=kwargs,
    )
Exemple #6
0
def find_packaging(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract packaging insights using the ``KEYWORD_PROCESSOR_STORE`` processor.

    A matched keyword may map to several packagings, separated by ``";"``;
    one insight is produced per packaging, all sharing the same matched text.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A list of ``RawInsight`` objects of type ``InsightType.packaging``,
        empty when there is no text or no match.
    """
    text = get_text(content)
    if not text:
        return []

    keyword_processor = KEYWORD_PROCESSOR_STORE.get()
    found = []

    for (packaging_str, _), start, end in keyword_processor.extract_keywords(
        text, span_info=True
    ):
        matched_text = text[start:end]
        found.extend(
            RawInsight(
                type=InsightType.packaging,
                value_tag=get_tag(packaging),
                value=packaging,
                data={"text": matched_text, "notify": False},
                automatic_processing=True,
            )
            for packaging in packaging_str.split(";")
        )

    return found
def find_packager_codes_regex(
        ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract packager code insights using the ``PACKAGER_CODE`` regexes.

    The insight value is the raw match, or the output of the regex's
    processing function when one is defined.

    Args:
        ocr_result: an ``OCRResult`` or a plain string.

    Returns:
        A list of ``RawInsight`` objects of type
        ``InsightType.packager_code``, possibly empty.
    """
    codes: List[RawInsight] = []

    for code_type, ocr_regex in PACKAGER_CODE.items():
        text = get_text(ocr_result, ocr_regex, ocr_regex.lowercase)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            raw_code = match.group(0)
            process = ocr_regex.processing_func
            code = raw_code if process is None else process(match)

            codes.append(
                RawInsight(
                    value=code,
                    data={
                        "raw": raw_code,
                        "type": code_type,
                        "notify": ocr_regex.notify,
                    },
                    type=InsightType.packager_code,
                    automatic_processing=True,
                )
            )

    return codes
Exemple #8
0
def find_nutrient_mentions(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Locate nutrient mentions using the ``NUTRIENT_MENTIONS_REGEX`` regexes.

    For every match, the raw text, its span and a list of languages are
    recorded under the regex code. The languages are derived from the name of
    the first named group that matched: the part after the last ``"_"`` is
    dropped and the remainder is split on ``"_"``.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A single-element list holding one ``RawInsight`` of type
        ``InsightType.nutrient_mention`` with all mentions, or an empty list
        when nothing matched.
    """
    mentions: JSONType = {}

    for regex_code, ocr_regex in NUTRIENT_MENTIONS_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            # Name of the first named group that took part in the match.
            matched_group = next(
                (
                    name
                    for name, captured in match.groupdict().items()
                    if captured is not None
                ),
                None,
            )

            if matched_group is None:
                languages: List[str] = []
            else:
                languages = matched_group.rsplit("_", maxsplit=1)[0].split("_")

            mentions.setdefault(regex_code, []).append(
                {
                    "raw": match.group(0),
                    "span": list(match.span()),
                    "languages": languages,
                }
            )

    if not mentions:
        return []

    insight = RawInsight(
        type=InsightType.nutrient_mention,
        data={"mentions": mentions, "version": EXTRACTOR_VERSION},
    )
    return [insight]
Exemple #9
0
def find_traces(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract trace insights from the text following a ``TRACES_REGEX`` match.

    For each match of ``TRACES_REGEX``, the 100 characters that follow it are
    scanned with the ``TRACE_KEYWORD_PROCESSOR_STORE`` processor, and one
    insight is produced per keyword found.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A list of ``RawInsight`` objects of type ``InsightType.trace``, empty
        when there is no text or no match.
    """
    text = get_text(content, TRACES_REGEX)
    if not text:
        return []

    keyword_processor = TRACE_KEYWORD_PROCESSOR_STORE.get()
    traces = []

    for prompt_match in TRACES_REGEX.regex.finditer(text):
        prompt = prompt_match.group()
        window_start = prompt_match.end()
        # Keywords are only searched right after the prompt.
        window = text[window_start : window_start + 100]

        keyword_hits = keyword_processor.extract_keywords(window, span_info=True)
        for (trace_tag, _), start, end in keyword_hits:
            traces.append(
                RawInsight(
                    type=InsightType.trace,
                    value_tag=trace_tag,
                    data={
                        "text": window[start:end],
                        "prompt": prompt,
                        "notify": False,
                    },
                )
            )

    return traces
Exemple #10
0
def find_nutrient_values(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract nutrient values using the ``NUTRIENT_VALUES_REGEX`` regexes.

    For each match, group 2 is taken as the value (with ``","`` replaced by
    ``"."``) and group 3 as the unit; entries are grouped by regex code.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A single-element list holding one ``RawInsight`` of type
        ``InsightType.nutrient`` with all values, or an empty list when
        nothing matched.
    """
    values_by_nutrient: JSONType = {}

    for nutrient_code, ocr_regex in NUTRIENT_VALUES_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            entry = {
                "raw": match.group(0),
                "nutrient": nutrient_code,
                # Normalise the decimal separator.
                "value": match.group(2).replace(",", "."),
                "unit": match.group(3),
            }
            values_by_nutrient.setdefault(nutrient_code, []).append(entry)

    if not values_by_nutrient:
        return []

    insight = RawInsight(
        type=InsightType.nutrient,
        data={"nutrients": values_by_nutrient, "version": EXTRACTOR_VERSION},
    )
    return [insight]
Exemple #11
0
def extract_nutriscore_label(
        image: Image.Image, manual_threshold: float,
        automatic_threshold: float) -> Optional[RawInsight]:
    """Detect a nutriscore on an image and turn it into a label insight.

    The ``"nutriscore"`` model from ``ObjectDetectionModelRegistry`` is run on
    the image and only detections selected with ``manual_threshold`` are kept.

    Args:
        image: the image to analyse.
        manual_threshold: threshold passed to ``select`` to keep detections.
        automatic_threshold: score from which the insight is marked for
            automatic processing.

    Returns:
        A ``RawInsight`` of type ``InsightType.label``, or ``None`` when there
        is no detection or when more than one nutriscore is detected.
    """
    model = ObjectDetectionModelRegistry.get("nutriscore")
    raw_result = model.detect_from_image(image, output_image=False)
    results = raw_result.select(threshold=manual_threshold)

    if not results:
        return None

    if len(results) > 1:
        # Several detections are ambiguous: discard them all.
        # ``Logger.warn`` is a deprecated alias (removed in Python 3.13),
        # so ``warning`` is used instead.
        logger.warning(
            "more than one nutriscore detected, discarding detections")
        return None

    result = results[0]
    score = result.score

    automatic_processing = score >= automatic_threshold
    label_tag = NUTRISCORE_LABELS[result.label]

    return RawInsight(
        type=InsightType.label,
        value_tag=label_tag,
        automatic_processing=automatic_processing,
        data={
            "confidence": score,
            "bounding_box": result.bounding_box,
            "model": "nutriscore",
            "notify": True,
        },
    )
Exemple #12
0
def find_stores(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract store insights using ``STORE_REGEX``.

    For each regex match, the index of the first capturing group that matched
    selects the store in ``SORTED_STORES``.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A list of ``RawInsight`` objects of type ``InsightType.store``, empty
        when there is no text or no match.
    """
    text = get_text(content, STORE_REGEX)
    if not text:
        return []

    stores = []

    for match in STORE_REGEX.regex.finditer(text):
        # Only the first group that took part in the match is considered.
        first_hit = next(
            (
                (idx, captured)
                for idx, captured in enumerate(match.groups())
                if captured is not None
            ),
            None,
        )
        if first_hit is None:
            continue

        group_idx, matched_text = first_hit
        store, _ = SORTED_STORES[group_idx]
        stores.append(
            RawInsight(
                type=InsightType.store,
                value=store,
                value_tag=get_store_tag(store),
                data={"text": matched_text, "notify": store in NOTIFY_STORES},
            )
        )

    return stores
Exemple #13
0
def _get_raw_insight(probabilily: float, index: int) -> RawInsight:
    """Build a category insight for the category at ``index``.

    Args:
        probabilily: prediction probability, stored rounded to 4 decimals as
            the insight confidence.
        index: position of the category in ``LIST_CATEGORIES``.

    Returns:
        A ``RawInsight`` of type ``InsightType.category`` with predictor
        ``"ridge_model-ml"``.
    """
    category_tag = LIST_CATEGORIES[index]
    confidence = round(probabilily, 4)

    return RawInsight(
        type=InsightType.category,
        value_tag=category_tag,
        data={"confidence": confidence},
        predictor="ridge_model-ml",
    )
Exemple #14
0
def extract_image_flag_flashtext(
    processor: KeywordProcessor, text: str
) -> Optional[RawInsight]:
    """Return an image flag insight for the first keyword found in ``text``.

    Args:
        processor: keyword processor used to scan the text.
        text: the text to scan.

    Returns:
        A ``RawInsight`` of type ``InsightType.image_flag`` built from the
        first keyword match, or ``None`` when no keyword is found.
    """
    keyword_hits = processor.extract_keywords(text, span_info=True)

    # Only the first match matters; later ones are ignored.
    first_hit = next(iter(keyword_hits), None)
    if first_hit is None:
        return None

    (_, label), start, end = first_hit
    return RawInsight(
        type=InsightType.image_flag,
        data={"text": text[start:end], "type": "text", "label": label},
    )
Exemple #15
0
def find_image_orientation(
        ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Build an image orientation insight from an OCR result.

    Args:
        ocr_result: an ``OCRResult``; plain strings yield no insight.

    Returns:
        A single-element list with a ``RawInsight`` of type
        ``InsightType.image_orientation`` whose data is the JSON form of the
        orientation result plus a ``"rotation"`` item, or an empty list when
        the input is a string or no orientation is available.
    """
    orientation = (
        None if isinstance(ocr_result, str) else ocr_result.get_orientation()
    )
    if orientation is None:
        return []

    payload = orientation.to_json()
    payload["rotation"] = get_rotation_angle_from_orientation(
        orientation.orientation)

    return [RawInsight(type=InsightType.image_orientation, data=payload)]
Exemple #16
0
    def extract_addresses(
            self, content: Union[str, OCRResult]) -> List[RawInsight]:
        """Extract addresses from the given OCR result.

        A city name is reported only when a postal code is found nearby. The
        text extract spans both matches, widened by
        ``self.text_extract_distance`` characters on each side and clamped to
        the text bounds.

        Args:
            content (OCRResult or str): a string or the OCR result to process.

        Returns:
            list of RawInsight: one location insight per address found. The
            data of each insight holds the items: country_code (always "fr"),
            city_name, postal_code and text_extract.
        """
        raw_text = (
            self.get_text(content) if isinstance(content, OCRResult) else content
        )
        text = self.normalize_text(raw_text)

        insights = []
        for city, city_start, city_end in self.find_city_names(text):
            postal_code_match = self.find_nearby_postal_code(
                text, city, city_start, city_end)
            if postal_code_match is None:
                continue

            # Only the span of the postal code match is needed here: the
            # reported postal code is the one attached to the city.
            _, pc_start, pc_end = postal_code_match

            extract_start = min(city_start, pc_start) - self.text_extract_distance
            extract_end = max(city_end, pc_end) + self.text_extract_distance
            text_extract = text[max(0, extract_start):min(len(text), extract_end)]

            insights.append(
                RawInsight(
                    type=InsightType.location,
                    data={
                        "country_code": "fr",
                        "city_name": city.name,
                        "postal_code": city.postal_code,
                        "text_extract": text_extract,
                    },
                )
            )

        return insights
Exemple #17
0
def extract_label_flashtext(processor: KeywordProcessor, text: str) -> List[RawInsight]:
    """Build one label insight per keyword found in ``text``.

    Args:
        processor: keyword processor used to scan the text.
        text: the text to scan.

    Returns:
        A list of ``RawInsight`` objects of type ``InsightType.label`` with
        predictor ``"flashtext"``, possibly empty.
    """
    keyword_hits = processor.extract_keywords(text, span_info=True)

    return [
        RawInsight(
            type=InsightType.label,
            value_tag=label_tag,
            automatic_processing=False,
            predictor="flashtext",
            data={"text": text[start:end], "notify": False},
        )
        for (label_tag, _), start, end in keyword_hits
    ]
Exemple #18
0
def extract_brands_google_cloud_vision(ocr_result: OCRResult) -> List[RawInsight]:
    """Build brand insights from the logo annotations of an OCR result.

    Only annotations whose description is a key of ``LOGO_ANNOTATION_BRANDS``
    are kept.

    Args:
        ocr_result: the OCR result whose logo annotations are inspected.

    Returns:
        A list of ``RawInsight`` objects of type ``InsightType.brand`` with
        predictor ``"google-cloud-vision"``, possibly empty.
    """
    brands = []

    for logo in ocr_result.logo_annotations:
        description = logo.description
        if description not in LOGO_ANNOTATION_BRANDS:
            continue

        brand_name = LOGO_ANNOTATION_BRANDS[description]
        brands.append(
            RawInsight(
                type=InsightType.brand,
                value=brand_name,
                value_tag=get_tag(brand_name),
                automatic_processing=False,
                predictor="google-cloud-vision",
                data={"confidence": logo.score, "notify": False},
            )
        )

    return brands
Exemple #19
0
def predict(client, product: Dict) -> Optional[ProductInsights]:
    """Predict the category of a product from its localized product names.

    A prediction is requested with ``predict_category`` for every language in
    ``product["languages_codes"]`` that has a non-empty
    ``product_name_<lang>`` field; the prediction with the highest score is
    kept.

    Args:
        client: passed through unchanged to ``predict_category``.
        product: product dict; reads ``languages_codes``,
            ``product_name_<lang>`` and, when a prediction exists, ``code``.

    Returns:
        A ``ProductInsights`` holding a single category ``RawInsight``, or
        ``None`` when no prediction could be made.
    """
    predictions = []

    for lang in product.get("languages_codes", []):
        product_name = product.get("product_name_{}".format(lang))

        if not product_name:
            continue

        prediction = predict_category(client, product_name, lang)

        if prediction is None:
            continue

        category, score = prediction
        predictions.append((lang, category, product_name, score))

    if not predictions:
        return None

    # Keep the highest-scoring prediction. The score is the 4th tuple item
    # (index 3); index 2 is the product name, which must not drive the choice.
    lang, category, product_name, score = max(
        predictions, key=operator.itemgetter(3)
    )

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=[
            RawInsight(
                type=InsightType.category,
                value_tag=category,
                data={
                    "lang": lang,
                    "product_name": product_name,
                    "model": "matcher",
                },
            )
        ],
    )
Exemple #20
0
def format_predictions(product: Dict, predictions: List[CategoryPrediction],
                       lang: str) -> ProductInsights:
    """Wrap category predictions for a product into a ``ProductInsights``.

    Args:
        product: product dict; its ``"code"`` item is used as barcode.
        predictions: iterable of ``(category, confidence)`` pairs.
        lang: language stored in the data of every insight.

    Returns:
        A ``ProductInsights`` of type ``InsightType.category`` with one
        ``RawInsight`` per prediction, in the same order.
    """
    category_insights = [
        RawInsight(
            type=InsightType.category,
            value_tag=category,
            data={"lang": lang, "model": "neural", "confidence": confidence},
        )
        for category, confidence in predictions
    ]

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=category_insights,
    )
def extract_fishing_code(processor: KeywordProcessor,
                         text: str) -> List[RawInsight]:
    """Build one packager code insight per fishing keyword found in ``text``.

    Args:
        processor: keyword processor used to scan the text.
        text: the text to scan.

    Returns:
        A list of ``RawInsight`` objects of type
        ``InsightType.packager_code`` whose data ``"type"`` is ``"fishing"``,
        possibly empty.
    """
    keyword_hits = processor.extract_keywords(text, span_info=True)

    return [
        RawInsight(
            type=InsightType.packager_code,
            value=code,
            predictor="flashtext",
            data={"type": "fishing", "raw": text[start:end], "notify": False},
            automatic_processing=True,
        )
        for (code, _), start, end in keyword_hits
    ]
Exemple #22
0
def extract_brands(processor: KeywordProcessor, text: str,
                   data_source_name: str) -> List[RawInsight]:
    """Build one brand insight per brand keyword found in ``text``.

    Args:
        processor: keyword processor used to scan the text.
        text: the text to scan.
        data_source_name: stored as the predictor of every insight.

    Returns:
        A list of ``RawInsight`` objects of type ``InsightType.brand``,
        possibly empty.
    """
    keyword_hits = processor.extract_keywords(text, span_info=True)

    return [
        RawInsight(
            type=InsightType.brand,
            value=brand_name,
            value_tag=brand_tag,
            automatic_processing=False,
            predictor=data_source_name,
            data={"text": text[start:end], "notify": False},
        )
        for (brand_tag, brand_name), start, end in keyword_hits
    ]
Exemple #23
0
def find_expiration_date(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract expiration date insights using ``EXPIRATION_DATE_REGEX``.

    Matches are parsed with the regex's processing function; matches without
    a processing function, unparsable dates and dates whose year is outside
    2015-2025 are skipped.

    Args:
        content: an ``OCRResult`` or a plain string.

    Returns:
        A list of ``RawInsight`` objects of type
        ``InsightType.expiration_date`` whose value is the date formatted as
        ``YYYY-MM-DD``, possibly empty.
    """
    dates: List[RawInsight] = []

    for date_type, ocr_regex in EXPIRATION_DATE_REGEX.items():
        text = get_text(content, ocr_regex)
        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            raw_text = match.group(0)

            if not ocr_regex.processing_func:
                continue

            parsed_date = ocr_regex.processing_func(match)
            if parsed_date is None:
                continue

            # NOTE(review): the accepted year window is hard-coded; confirm
            # it is still the intended range.
            if not (2015 <= parsed_date.year <= 2025):
                continue

            dates.append(
                RawInsight(
                    # Dates are formatted according to ISO 8601.
                    value=parsed_date.strftime("%Y-%m-%d"),
                    type=InsightType.expiration_date,
                    data={
                        "raw": raw_text,
                        "type": date_type,
                        "notify": ocr_regex.notify,
                    },
                )
            )

    return dates
Exemple #24
0
def get_image_lang(ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Build an image language insight from an OCR result.

    The per-language counts returned by ``ocr_result.get_languages()`` are
    converted to percentages of its ``"words"`` entry.

    Args:
        ocr_result: an ``OCRResult``; plain strings yield no insight.

    Returns:
        A single-element list with a ``RawInsight`` of type
        ``InsightType.image_lang`` holding the raw counts and the
        percentages, or an empty list when the input is a string or no
        language information is available.
    """
    if isinstance(ocr_result, str):
        return []

    counts: Optional[Dict[str, int]] = ocr_result.get_languages()
    if counts is None:
        return []

    total_words = counts["words"]
    # "words" is the total, not a language: it is excluded from percentages.
    percentages = {
        lang: count * 100 / total_words
        for lang, count in counts.items()
        if lang != "words"
    }

    return [
        RawInsight(
            type=InsightType.image_lang,
            data={"count": counts, "percent": percentages},
        )
    ]