def process_es_bio_label_code(match) -> str:
    """Build a lowercase ``en:es-eco-<group 1>-<group 2>`` tag from a match.

    Args:
        match: regex match object exposing at least two capturing groups.

    Returns:
        The formatted tag, lowercased as a whole.
    """
    return ("en:es-eco-{}-{}".format(match.group(1), match.group(2)).lower())


# Alternative wordings of an organic claim. They are OR-ed together (each
# wrapped in a non-capturing group) to build the 'en:organic' pattern below.
EN_ORGANIC_REGEX_STR = [
    r'ingr[ée]dients?\sbiologiques?',
    r'ingr[ée]dients?\sbio[\s.,)]',
    r'agriculture ue/non ue biologique',
    r'agriculture bio(?:logique)?[\s.,)]',
    r'production bio(?:logique)?[\s.,)]',
]

# Maps a label key to the list of OCRRegex definitions used to detect it.
LABELS_REGEX = {
    'en:organic': [
        OCRRegex(re.compile(r"|".join(
            [r"(?:{})".format(x) for x in EN_ORGANIC_REGEX_STR])),
            field=OCRField.full_text_contiguous,
            lowercase=True),
    ],
    'xx-bio-xx': [
        # The negative lookbehind (?<![a-zA-Z]) prevents a match when
        # additional letters directly precede the label code.
        OCRRegex(re.compile(
            r"(?<![a-zA-Z])([A-Z]{2})[\-\s.](BIO|ÖKO|OKO|EKO|ØKO|ORG|Bio)[\-\s.](\d{2,3})"
        ),
            field=OCRField.text_annotations,
            lowercase=False,
            processing_func=process_eu_bio_label_code),
        # Spain-specific regex
        OCRRegex(re.compile(
            r"(?<![a-zA-Z])ES[\-\s.]ECO[\-\s.](\d{3})[\-\s.]([A-Z]{2,3})"),
            field=OCRField.text_annotations,
else: store = item regex_str = re.escape(item.lower()) sorted_stores[store] = regex_str return sorted(sorted_stores.items(), key=store_sort_key) SORTED_STORES = get_sorted_stores() STORE_REGEX_STR = "|".join(r"((?<!\w){}(?!\w))".format(pattern) for _, pattern in SORTED_STORES) NOTIFY_STORES: Set[str] = set( text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH)) STORE_REGEX = OCRRegex(re.compile(STORE_REGEX_STR), field=OCRField.full_text_contiguous, lowercase=True) def find_stores(ocr_result: OCRResult) -> List[Dict]: results = [] text = ocr_result.get_text(STORE_REGEX) if not text: return [] for match in STORE_REGEX.regex.finditer(text): groups = match.groups() for idx, match_str in enumerate(groups):
return "en:es-eco-{}-{}".format(match.group(1), match.group(2)).lower() EN_ORGANIC_REGEX_STR = [ r"ingr[ée]dients?\sbiologiques?", r"ingr[ée]dients?\sbio[\s.,)]", r"agriculture ue/non ue biologique", r"agriculture bio(?:logique)?[\s.,)]", r"production bio(?:logique)?[\s.,)]", ] LABELS_REGEX = { "en:organic": [ OCRRegex( re.compile(r"|".join( [r"(?:{})".format(x) for x in EN_ORGANIC_REGEX_STR])), field=OCRField.full_text_contiguous, lowercase=True, ), ], "xx-bio-xx": [ # The negative lookbehind (?<![a-zA-Z]) is useful to avoid to match # strings if additional chars are before the label OCRRegex( re.compile( r"(?<![a-zA-Z])([A-Z]{2})[\-\s.](BIO|ÖKO|OKO|EKO|ØKO|ORG|Bio)[\-\s.](\d{2,3})" ), field=OCRField.text_annotations, lowercase=False, processing_func=process_eu_bio_label_code, ), # Spain specific regex
format_str: str = "%d/%m/%y" else: format_str = "%d/%m/%Y" try: date = datetime.datetime.strptime("{}/{}/{}".format(day, month, year), format_str).date() except ValueError: return None return date EXPIRATION_DATE_REGEX: Dict[str, OCRRegex] = { 'full_digits_short': OCRRegex(re.compile(r'(?<!\d)(\d{2})[-./](\d{2})[-./](\d{2})(?!\d)'), field=OCRField.full_text, lowercase=False, processing_func=functools.partial(process_full_digits_expiration_date, short=True)), 'full_digits_long': OCRRegex(re.compile(r'(?<!\d)(\d{2})[-./](\d{2})[-./](\d{4})(?!\d)'), field=OCRField.full_text, lowercase=False, processing_func=functools.partial(process_full_digits_expiration_date, short=False)), } def find_expiration_date(ocr_result: OCRResult) -> List[Dict]: # Parse expiration date # "À consommer de préférence avant", results = []
    city_code, company_code = match.group(1, 2)
    # Drop any spaces captured inside group 1.
    city_code = city_code.replace(" ", "")
    # Group 2 may be None; fall back to an empty suffix in that case.
    company_code = company_code or ""
    return "EMB {}{}".format(city_code, company_code).upper()


def process_fsc_match(match) -> str:
    """Return the upper-cased ``FSC-<group 1>`` string for a regex match.

    Args:
        match: regex match object whose first group holds the code.
    """
    fsc_code = match.group(1)
    return "FSC-{}".format(fsc_code).upper()


# Packager-code patterns keyed by code family; every pattern is written in
# lowercase and declared with lowercase=True.
PACKAGER_CODE: Dict[str, OCRRegex] = {
    "fr_emb": OCRRegex(
        # Five digits, each optionally separated by a space, then an
        # optional letter; must not be followed by another letter or digit.
        re.compile(r"emb ?(\d ?\d ?\d ?\d ?\d) ?([a-z])?(?![a-z0-9])"),
        field=OCRField.text_annotations,
        lowercase=True,
        processing_func=process_fr_emb_match,
    ),
    "fsc": OCRRegex(
        # "fsc", an optional single character and space, then "c" + 6 digits.
        re.compile(r"fsc.? ?(c\d{6})"),
        field=OCRField.text_annotations,
        lowercase=True,
        processing_func=process_fsc_match,
    ),
    "eu_fr": OCRRegex(
        re.compile(
            r"fr (\d{2,3}|2[ab])[\-\s.](\d{3})[\-\s.](\d{3}) (ce|ec)(?![a-z0-9])"
        ),
        field=OCRField.full_text_contiguous,
) ) def generate_nutrient_mention_regex(nutrient_mentions: List[NutrientMentionType]): sub_re = "|".join( r"(?P<{}>{})".format("{}_{}".format("_".join(lang), i), name) for i, (name, lang) in enumerate(nutrient_mentions) ) return re.compile(r"(?<!\w){}(?!\w)".format(sub_re)) NUTRIENT_VALUES_REGEX = { nutrient: OCRRegex( generate_nutrient_regex(NUTRIENT_MENTION[nutrient], units), field=OCRField.full_text_contiguous, lowercase=True, ) for nutrient, units in NUTRIENT_UNITS.items() } NUTRIENT_MENTIONS_REGEX: Dict[str, OCRRegex] = { nutrient: OCRRegex( generate_nutrient_mention_regex(NUTRIENT_MENTION[nutrient]), field=OCRField.full_text_contiguous, lowercase=True, ) for nutrient in NUTRIENT_MENTION }
if is_suspicious_weight(normalized_value, normalized_unit): # Don't process the insight automatically if the value # is suspiciously high result["automatic_processing"] = False return result PRODUCT_WEIGHT_REGEX: Dict[str, OCRRegex] = { "with_mention": OCRRegex( re.compile( r"(?<![a-z])(poids|poids net [aà] l'emballage|poids net|poids net égoutté|masse nette|volume net total|net weight|net wt\.?|peso neto|peso liquido|netto[ -]?gewicht)\s?:?\s?([0-9]+[,.]?[0-9]*)\s?(fl oz|dle?|cle?|mge?|mle?|lbs|oz|ge?|kge?|le?)(?![a-z])" ), field=OCRField.full_text_contiguous, lowercase=True, processing_func=functools.partial(process_product_weight, prompt=True), priority=1, ), "with_ending_mention": OCRRegex( re.compile( r"(?<![a-z])([0-9]+[,.]?[0-9]*)\s?(fl oz|dle?|cle?|mge?|mle?|lbs|oz|ge?|kge?|le?)\s(net)(?![a-z])" ), field=OCRField.full_text_contiguous, lowercase=True, processing_func=functools.partial(process_product_weight, prompt=True, ending_prompt=True), priority=1,
import re
from typing import List, Dict

from robotoff.insights.ocr.dataclass import OCRRegex, OCRField, OCRResult

# Phrases that introduce a "may contain traces of ..." statement. The pattern
# is written in lowercase and declared with lowercase=True.
TRACES_REGEX = OCRRegex(re.compile(
    r"(?:possibilit[ée] de traces|peut contenir(?: des traces)?|traces? [ée]ventuelles? de)"
),
    field=OCRField.full_text_contiguous,
    lowercase=True)


def find_traces(ocr_result: OCRResult) -> List[Dict]:
    """Collect the text that follows each trace-introducing phrase.

    For every TRACES_REGEX match in the OCR text, a dict is built with:
      - 'raw': the matched phrase,
      - 'text': up to 100 characters following the match,
      - 'notify': the ``notify`` attribute of TRACES_REGEX.

    An empty list is returned when ``get_text`` yields no text.
    """
    results = []

    text = ocr_result.get_text(TRACES_REGEX)

    if not text:
        return []

    for match in TRACES_REGEX.regex.finditer(text):
        raw = match.group()
        end_idx = match.end()
        # Slicing past the end of the string is safe: it simply yields a
        # shorter (possibly empty) excerpt.
        captured = text[end_idx:end_idx + 100]

        result = {
            'raw': raw,
            'text': captured,
            'notify': TRACES_REGEX.notify,
        }
        results.append(result)
nutrient_names_str = "|".join(nutrient_names) units_str = "|".join(units) return re.compile( r"(?<!\w)({}) ?(?:[:-] ?)?([0-9]+[,.]?[0-9]*) ?({})(?!\w)".format( nutrient_names_str, units_str)) NUTRIENT_VALUES_REGEX = { "energy": OCRRegex( generate_nutrient_regex( [ "[ée]nergie", # fr/de "energy", # en "calories", # fr/en "energia", # es "valor energ[ée]tico", # es ], ["kj", "kcal"], ), field=OCRField.full_text_contiguous, lowercase=True, ), "saturated_fat": OCRRegex( generate_nutrient_regex( [ "mati[èe]res? grasses? satur[ée]s?", # fr "acides? gras satur[ée]s?", # fr "saturated fat", # en "of which saturates", # en "verzadigde vetzuren", # nl
from robotoff.insights.ocr.dataclass import OCRResult, OCRRegex, OCRField


def generate_nutrient_regex(nutrient_names: List[str], units: List[str]):
    """Compile a regex matching ``<nutrient name> <number> <unit>``.

    Both lists are joined with ``|`` and inserted unescaped, so each item may
    itself be a regex fragment (e.g. ``"[ée]nergie"``).

    Capturing groups of the resulting pattern:
      1. the nutrient name,
      2. the numeric value (digits with an optional ``,`` or ``.`` and
         further digits),
      3. the unit.

    An optional ``:`` or ``-`` separator is accepted between name and value.
    The lookarounds require that no word character directly precedes the
    name or directly follows the unit.
    """
    nutrient_names_str = '|'.join(nutrient_names)
    units_str = '|'.join(units)
    return re.compile(
        r"(?<!\w)({}) ?(?:[:-] ?)?([0-9]+[,.]?[0-9]*) ?({})(?!\w)".format(
            nutrient_names_str, units_str))


# Value-extraction regex per nutrient key.
NUTRIENT_VALUES_REGEX = {
    'energy': OCRRegex(generate_nutrient_regex(["[ée]nergie", "energy", "calories"], ["kj", "kcal"]),
                       field=OCRField.full_text_contiguous,
                       lowercase=True),
    'saturated_fat': OCRRegex(generate_nutrient_regex(
        ["mati[èe]res? grasses? satur[ée]s?", "saturated fat"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True),
    'trans_fat': OCRRegex(generate_nutrient_regex(
        ["mati[èe]res? grasses? trans", "trans fat"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True),
    'fat': OCRRegex(generate_nutrient_regex(["mati[èe]res? grasses?", "total fat"], ["g"]),
                    field=OCRField.full_text_contiguous,
import re
from typing import List, Dict

from robotoff.insights.ocr.dataclass import OCRResult, OCRRegex, OCRField


def generate_nutrient_regex(nutrient_names: List[str], units: List[str]):
    """Compile a regex matching ``<nutrient name> <number> <unit>``.

    Both lists are joined with ``|`` and inserted unescaped, so each item may
    itself be a regex fragment (e.g. ``"[ée]nergie"``).

    Capturing groups of the resulting pattern:
      1. the nutrient name,
      2. the numeric value (digits with an optional ``,`` or ``.`` and
         further digits),
      3. the unit.

    An optional ``:`` or ``-`` separator is accepted between name and value.

    Args:
        nutrient_names: alternative spellings of the nutrient.
        units: accepted unit spellings.

    Returns:
        The compiled pattern.
    """
    nutrient_names_str = "|".join(nutrient_names)
    units_str = "|".join(units)
    # The lookarounds keep the name and the unit from matching inside longer
    # words (e.g. "energy" within "synergy", or "g" at the start of an
    # unrelated word that follows a number).
    return re.compile(
        r"(?<!\w)({}) ?(?:[:-] ?)?([0-9]+[,.]?[0-9]*) ?({})(?!\w)".format(
            nutrient_names_str, units_str
        )
    )


# Value-extraction regex per nutrient key. Patterns are written in lowercase
# and declared with lowercase=True.
NUTRIENT_VALUES_REGEX = {
    "energy": OCRRegex(
        generate_nutrient_regex(["[ée]nergie", "energy"], ["kj", "kcal"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
    "fat": OCRRegex(
        generate_nutrient_regex(["mati[èe]res? grasses?"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
    "glucid": OCRRegex(
        generate_nutrient_regex(["glucides?", "glucids?"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
    "carbohydrate": OCRRegex(
        generate_nutrient_regex(["sucres?", "carbohydrates?"], ["g"]),
        field=OCRField.full_text_contiguous,
        lowercase=True,
    ),
}
from robotoff.insights.ocr.utils import generate_keyword_processor
from robotoff.utils import text_file_iter
from robotoff.utils.cache import CachedStore


def generate_trace_keyword_processor(labels: Optional[List[str]] = None):
    """Build a keyword processor from the given labels.

    Args:
        labels: labels to register. When None, they are read from the file
            at ``settings.OCR_TRACE_ALLERGEN_DATA_PATH``.

    Returns:
        Whatever ``generate_keyword_processor`` returns for these labels.
    """
    if labels is None:
        labels = list(text_file_iter(settings.OCR_TRACE_ALLERGEN_DATA_PATH))

    return generate_keyword_processor(labels)


# Phrases that introduce a "may contain traces of ..." statement. The pattern
# is written in lowercase and declared with lowercase=True.
TRACES_REGEX = OCRRegex(
    re.compile(
        r"(?:possibilit[ée] de traces|conditionné dans un atelier qui manipule|peut contenir(?: des traces)?|traces? [ée]ventuelles? d[e']|traces? d[e']|may contain)"
    ),
    field=OCRField.full_text_contiguous,
    lowercase=True,
)

# Store wrapping the keyword-processor factory above; expiration_interval is
# None (presumably meaning the cached value never expires — confirm against
# CachedStore).
TRACE_KEYWORD_PROCESSOR_STORE = CachedStore(
    fetch_func=generate_trace_keyword_processor, expiration_interval=None
)


def find_traces(content: Union[OCRResult, str]) -> List[RawInsight]:
    insights = []

    # Resolve the text to scan for this regex from either input kind.
    text = get_text(content, TRACES_REGEX)

    if not text: