Example #1
0
def process_es_bio_label_code(match) -> str:
    """Build the ``en:es-eco-<g1>-<g2>`` tag from a regex match.

    :param match: match object exposing at least two groups
    :return: the tag built from groups 1 and 2, lowercased
    """
    first, second = match.group(1, 2)
    tag = "en:es-eco-{}-{}".format(first, second)
    return tag.lower()


# Regex fragments for organic mentions. LABELS_REGEX wraps each one in a
# non-capturing group and joins them with "|" to build the 'en:organic'
# pattern, which is applied with lowercase=True.
EN_ORGANIC_REGEX_STR = [
    # "ingredient(s) biologique(s)", with or without the accent
    r"ingr[ée]dients?\sbiologiques?",
    # "ingredient(s) bio" followed by whitespace, ".", "," or ")"
    r"ingr[ée]dients?\sbio[\s.,)]",
    r"agriculture ue/non ue biologique",
    # "agriculture bio" or "agriculture biologique" followed by a delimiter
    r"agriculture bio(?:logique)?[\s.,)]",
    # "production bio" or "production biologique" followed by a delimiter
    r"production bio(?:logique)?[\s.,)]",
]

LABELS_REGEX = {
    'en:organic': [
        OCRRegex(re.compile(r"|".join(
            [r"(?:{})".format(x) for x in EN_ORGANIC_REGEX_STR])),
                 field=OCRField.full_text_contiguous,
                 lowercase=True),
    ],
    'xx-bio-xx': [
        # The negative lookbehind (?<![a-zA-Z]) prevents a match when the
        # label code is directly preceded by other letters
        OCRRegex(re.compile(
            r"(?<![a-zA-Z])([A-Z]{2})[\-\s.](BIO|ÖKO|OKO|EKO|ØKO|ORG|Bio)[\-\s.](\d{2,3})"
        ),
                 field=OCRField.text_annotations,
                 lowercase=False,
                 processing_func=process_eu_bio_label_code),
        # Spain specific regex
        OCRRegex(re.compile(
            r"(?<![a-zA-Z])ES[\-\s.]ECO[\-\s.](\d{3})[\-\s.]([A-Z]{2,3})"),
                 field=OCRField.text_annotations,
Example #2
0
        else:
            store = item
            regex_str = re.escape(item.lower())

        sorted_stores[store] = regex_str

    return sorted(sorted_stores.items(), key=store_sort_key)


# (store, pattern) pairs as returned by get_sorted_stores().
SORTED_STORES = get_sorted_stores()
# One capturing group per store, in SORTED_STORES order; a store pattern
# only matches when it is not surrounded by word characters.
STORE_REGEX_STR = "|".join(
    [r"((?<!\w){}(?!\w))".format(store_pattern)
     for (_, store_pattern) in SORTED_STORES])
# Entries read from the stores-notify data file.
NOTIFY_STORES: Set[str] = set(
    text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH))
STORE_REGEX = OCRRegex(
    re.compile(STORE_REGEX_STR),
    field=OCRField.full_text_contiguous,
    lowercase=True,
)


def find_stores(ocr_result: OCRResult) -> List[Dict]:
    results = []

    text = ocr_result.get_text(STORE_REGEX)

    if not text:
        return []

    for match in STORE_REGEX.regex.finditer(text):
        groups = match.groups()

        for idx, match_str in enumerate(groups):
Example #3
0
    return "en:es-eco-{}-{}".format(match.group(1), match.group(2)).lower()


# Regex fragments for organic mentions. LABELS_REGEX wraps each one in a
# non-capturing group and joins them with "|" to build the "en:organic"
# pattern, which is applied with lowercase=True.
EN_ORGANIC_REGEX_STR = [
    # "ingredient(s) biologique(s)", with or without the accent
    r"ingr[ée]dients?\sbiologiques?",
    # "ingredient(s) bio" followed by whitespace, ".", "," or ")"
    r"ingr[ée]dients?\sbio[\s.,)]",
    r"agriculture ue/non ue biologique",
    # "agriculture bio" or "agriculture biologique" followed by a delimiter
    r"agriculture bio(?:logique)?[\s.,)]",
    # "production bio" or "production biologique" followed by a delimiter
    r"production bio(?:logique)?[\s.,)]",
]

LABELS_REGEX = {
    "en:organic": [
        OCRRegex(
            re.compile(r"|".join(
                [r"(?:{})".format(x) for x in EN_ORGANIC_REGEX_STR])),
            field=OCRField.full_text_contiguous,
            lowercase=True,
        ),
    ],
    "xx-bio-xx": [
        # The negative lookbehind (?<![a-zA-Z]) prevents a match when the
        # label code is directly preceded by other letters
        OCRRegex(
            re.compile(
                r"(?<![a-zA-Z])([A-Z]{2})[\-\s.](BIO|ÖKO|OKO|EKO|ØKO|ORG|Bio)[\-\s.](\d{2,3})"
            ),
            field=OCRField.text_annotations,
            lowercase=False,
            processing_func=process_eu_bio_label_code,
        ),
        # Spain specific regex
Example #4
0
        format_str: str = "%d/%m/%y"
    else:
        format_str = "%d/%m/%Y"

    try:
        date = datetime.datetime.strptime("{}/{}/{}".format(day, month, year), format_str).date()
    except ValueError:
        return None

    return date


# All-digit date patterns: two 2-digit groups followed by a 2-digit ("short")
# or 4-digit ("long") group, separated by "-", "." or "/". The lookarounds
# reject candidates embedded in a longer run of digits.
EXPIRATION_DATE_REGEX: Dict[str, OCRRegex] = {
    "full_digits_short": OCRRegex(
        re.compile(r"(?<!\d)(\d{2})[-./](\d{2})[-./](\d{2})(?!\d)"),
        field=OCRField.full_text,
        lowercase=False,
        processing_func=functools.partial(
            process_full_digits_expiration_date, short=True
        ),
    ),
    "full_digits_long": OCRRegex(
        re.compile(r"(?<!\d)(\d{2})[-./](\d{2})[-./](\d{4})(?!\d)"),
        field=OCRField.full_text,
        lowercase=False,
        processing_func=functools.partial(
            process_full_digits_expiration_date, short=False
        ),
    ),
}


def find_expiration_date(ocr_result: OCRResult) -> List[Dict]:
    # Parse expiration date
    #        "À consommer de préférence avant",
    results = []
Example #5
0
    city_code, company_code = match.group(1, 2)
    city_code = city_code.replace(" ", "")
    company_code = company_code or ""
    return "EMB {}{}".format(city_code, company_code).upper()


def process_fsc_match(match) -> str:
    """Return ``FSC-`` followed by group 1 of ``match``, uppercased.

    :param match: match object whose first group holds the code
    :return: the uppercased ``FSC-<code>`` string
    """
    return "FSC-{}".format(match.group(1)).upper()


PACKAGER_CODE: Dict[str, OCRRegex] = {
    "fr_emb":
    OCRRegex(
        re.compile(r"emb ?(\d ?\d ?\d ?\d ?\d) ?([a-z])?(?![a-z0-9])"),
        field=OCRField.text_annotations,
        lowercase=True,
        processing_func=process_fr_emb_match,
    ),
    "fsc":
    OCRRegex(
        re.compile(r"fsc.? ?(c\d{6})"),
        field=OCRField.text_annotations,
        lowercase=True,
        processing_func=process_fsc_match,
    ),
    "eu_fr":
    OCRRegex(
        re.compile(
            r"fr (\d{2,3}|2[ab])[\-\s.](\d{3})[\-\s.](\d{3}) (ce|ec)(?![a-z0-9])"
        ),
        field=OCRField.full_text_contiguous,
Example #6
0
        )
    )


def generate_nutrient_mention_regex(nutrient_mentions: List[NutrientMentionType]):
    """Compile one regex matching any of the given nutrient mentions.

    Each ``(name, lang)`` item of ``nutrient_mentions`` becomes a named
    group. ``name`` is inserted as a regex fragment; the group name is the
    elements of ``lang`` joined with ``_``, followed by ``_<index>`` where
    ``<index>`` is the position of the item in ``nutrient_mentions``.

    :param nutrient_mentions: iterable of ``(name, lang)`` pairs
    :return: compiled pattern in which every alternative must be
        delimited by non-word characters (or the string boundaries)
    """
    alternatives = "|".join(
        r"(?P<{}>{})".format("{}_{}".format("_".join(lang), i), name)
        for i, (name, lang) in enumerate(nutrient_mentions)
    )
    # "|" has the lowest precedence in a regex: without the enclosing
    # non-capturing group, the lookbehind would only guard the first
    # alternative and the lookahead only the last one.
    return re.compile(r"(?<!\w)(?:{})(?!\w)".format(alternatives))


# One value-extraction OCRRegex per nutrient listed in NUTRIENT_UNITS, built
# from that nutrient's mentions and its accepted units.
NUTRIENT_VALUES_REGEX = {
    name: OCRRegex(
        generate_nutrient_regex(NUTRIENT_MENTION[name], accepted_units),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    )
    for name, accepted_units in NUTRIENT_UNITS.items()
}

# One mention-detection OCRRegex per entry of NUTRIENT_MENTION.
NUTRIENT_MENTIONS_REGEX: Dict[str, OCRRegex] = {
    name: OCRRegex(
        generate_nutrient_mention_regex(NUTRIENT_MENTION[name]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    )
    for name in NUTRIENT_MENTION
}

Example #7
0
    if is_suspicious_weight(normalized_value, normalized_unit):
        # Don't process the insight automatically if the value
        # is suspiciously high
        result["automatic_processing"] = False

    return result


PRODUCT_WEIGHT_REGEX: Dict[str, OCRRegex] = {
    "with_mention":
    OCRRegex(
        re.compile(
            r"(?<![a-z])(poids|poids net [aà] l'emballage|poids net|poids net égoutté|masse nette|volume net total|net weight|net wt\.?|peso neto|peso liquido|netto[ -]?gewicht)\s?:?\s?([0-9]+[,.]?[0-9]*)\s?(fl oz|dle?|cle?|mge?|mle?|lbs|oz|ge?|kge?|le?)(?![a-z])"
        ),
        field=OCRField.full_text_contiguous,
        lowercase=True,
        processing_func=functools.partial(process_product_weight, prompt=True),
        priority=1,
    ),
    "with_ending_mention":
    OCRRegex(
        re.compile(
            r"(?<![a-z])([0-9]+[,.]?[0-9]*)\s?(fl oz|dle?|cle?|mge?|mle?|lbs|oz|ge?|kge?|le?)\s(net)(?![a-z])"
        ),
        field=OCRField.full_text_contiguous,
        lowercase=True,
        processing_func=functools.partial(process_product_weight,
                                          prompt=True,
                                          ending_prompt=True),
        priority=1,
Example #8
0
import re
from typing import List, Dict

from robotoff.insights.ocr.dataclass import OCRRegex, OCRField, OCRResult

# Phrases that introduce a list of possible traces. find_traces captures
# the text that follows each match.
TRACES_REGEX = OCRRegex(
    re.compile(
        r"(?:possibilit[ée] de traces"
        r"|peut contenir(?: des traces)?"
        r"|traces? [ée]ventuelles? de)"
    ),
    field=OCRField.full_text_contiguous,
    lowercase=True,
)


def find_traces(ocr_result: OCRResult) -> List[Dict]:
    results = []

    text = ocr_result.get_text(TRACES_REGEX)

    if not text:
        return []

    for match in TRACES_REGEX.regex.finditer(text):
        raw = match.group()
        end_idx = match.end()
        captured = text[end_idx:end_idx + 100]

        result = {
            'raw': raw,
            'text': captured,
            'notify': TRACES_REGEX.notify,
        }
        results.append(result)
Example #9
0
    nutrient_names_str = "|".join(nutrient_names)
    units_str = "|".join(units)
    return re.compile(
        r"(?<!\w)({}) ?(?:[:-] ?)?([0-9]+[,.]?[0-9]*) ?({})(?!\w)".format(
            nutrient_names_str, units_str))


NUTRIENT_VALUES_REGEX = {
    "energy":
    OCRRegex(
        generate_nutrient_regex(
            [
                "[ée]nergie",  # fr/de
                "energy",  # en
                "calories",  # fr/en
                "energia",  # es
                "valor energ[ée]tico",  # es
            ],
            ["kj", "kcal"],
        ),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
    "saturated_fat":
    OCRRegex(
        generate_nutrient_regex(
            [
                "mati[èe]res? grasses? satur[ée]s?",  # fr
                "acides? gras satur[ée]s?",  # fr
                "saturated fat",  # en
                "of which saturates",  # en
                "verzadigde vetzuren",  # nl
Example #10
0
from robotoff.insights.ocr.dataclass import OCRResult, OCRRegex, OCRField


def generate_nutrient_regex(nutrient_names: List[str], units: List[str]):
    """Compile a regex capturing a nutrient name, its value and its unit.

    The items of ``nutrient_names`` and ``units`` are regex fragments,
    each list being OR-ed into its own capturing group.

    :param nutrient_names: alternatives for group 1 (the nutrient name)
    :param units: alternatives for group 3 (the unit)
    :return: compiled pattern; group 2 is the number, which may use ``,``
        or ``.`` as decimal separator
    """
    template = r"(?<!\w)({}) ?(?:[:-] ?)?([0-9]+[,.]?[0-9]*) ?({})(?!\w)"
    return re.compile(template.format("|".join(nutrient_names), "|".join(units)))


NUTRIENT_VALUES_REGEX = {
    'energy':
    OCRRegex(generate_nutrient_regex(["[ée]nergie", "energy", "calories"],
                                     ["kj", "kcal"]),
             field=OCRField.full_text_contiguous,
             lowercase=True),
    'saturated_fat':
    OCRRegex(generate_nutrient_regex(
        ["mati[èe]res? grasses? satur[ée]s?", "saturated fat"], ["g"]),
             field=OCRField.full_text_contiguous,
             lowercase=True),
    'trans_fat':
    OCRRegex(generate_nutrient_regex(
        ["mati[èe]res? grasses? trans", "trans fat"], ["g"]),
             field=OCRField.full_text_contiguous,
             lowercase=True),
    'fat':
    OCRRegex(generate_nutrient_regex(["mati[èe]res? grasses?", "total fat"],
                                     ["g"]),
             field=OCRField.full_text_contiguous,
Example #11
0
import re
from typing import List, Dict

from robotoff.insights.ocr.dataclass import OCRResult, OCRRegex, OCRField


def generate_nutrient_regex(nutrient_names: List[str], units: List[str]):
    """Compile a regex capturing a nutrient name, its value and its unit.

    The items of ``nutrient_names`` and ``units`` are regex fragments,
    each list being OR-ed into its own capturing group.

    :param nutrient_names: alternatives for group 1 (the nutrient name)
    :param units: alternatives for group 3 (the unit)
    :return: compiled pattern; group 2 is the number, which may use ``,``
        or ``.`` as decimal separator
    """
    nutrient_names_str = "|".join(nutrient_names)
    units_str = "|".join(units)
    # The lookarounds keep a name from matching inside a longer word and a
    # short unit such as "g" from matching the first letter of the next word.
    return re.compile(
        r"(?<!\w)({}) ?(?:[:-] ?)?([0-9]+[,.]?[0-9]*) ?({})(?!\w)".format(
            nutrient_names_str, units_str))


# Value-extraction regexes keyed by nutrient; each is applied to the
# lowercased contiguous text.
NUTRIENT_VALUES_REGEX = {
    "energy": OCRRegex(
        generate_nutrient_regex(["[ée]nergie", "energy"], ["kj", "kcal"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
    "fat": OCRRegex(
        generate_nutrient_regex(["mati[èe]res? grasses?"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
    "glucid": OCRRegex(
        generate_nutrient_regex(["glucides?", "glucids?"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
    # NOTE(review): "sucres?" (sugars) is listed under 'carbohydrate' --
    # confirm this grouping is intended.
    "carbohydrate": OCRRegex(
        generate_nutrient_regex(["sucres?", "carbohydrates?"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
}
Example #12
0
from robotoff.insights.ocr.utils import generate_keyword_processor
from robotoff.utils import text_file_iter
from robotoff.utils.cache import CachedStore


def generate_trace_keyword_processor(labels: Optional[List[str]] = None):
    """Build the keyword processor used for trace detection.

    :param labels: keywords to register; when ``None`` they are read from
        ``settings.OCR_TRACE_ALLERGEN_DATA_PATH``
    :return: the result of ``generate_keyword_processor`` for those labels
    """
    keywords = (
        labels
        if labels is not None
        else list(text_file_iter(settings.OCR_TRACE_ALLERGEN_DATA_PATH))
    )
    return generate_keyword_processor(keywords)


# Phrases that introduce a trace statement, one alternative per line.
TRACES_REGEX = OCRRegex(
    re.compile(
        r"(?:possibilit[ée] de traces"
        r"|conditionné dans un atelier qui manipule"
        r"|peut contenir(?: des traces)?"
        r"|traces? [ée]ventuelles? d[e']"
        r"|traces? d[e']"
        r"|may contain)"
    ),
    field=OCRField.full_text_contiguous,
    lowercase=True,
)

# Store that builds the trace keyword processor through
# generate_trace_keyword_processor; no expiration interval is set.
TRACE_KEYWORD_PROCESSOR_STORE = CachedStore(
    fetch_func=generate_trace_keyword_processor,
    expiration_interval=None,
)


def find_traces(content: Union[OCRResult, str]) -> List[RawInsight]:
    insights = []

    text = get_text(content, TRACES_REGEX)

    if not text: