def test_check_ocr_brands():
    """Check the consistency of the OCR brand data files.

    For every entry of the brands data file: the raw entry must be unique,
    must not contain the ``’`` character, its brand name must be unique and
    its regex string must compile. An entry is either ``brand||regex`` or a
    bare brand name, in which case the regex is the escaped lowercase name.

    Every entry of the notify data file must be unique and must name a brand
    collected from the brands data file.
    """
    seen_entries: Set[str] = set()
    known_brands: Set[str] = set()

    for entry in text_file_iter(settings.OCR_BRANDS_DATA_PATH):
        assert entry not in seen_entries
        seen_entries.add(entry)

        assert '’' not in entry

        if '||' not in entry:
            brand, regex_str = entry, re.escape(entry.lower())
        else:
            brand, regex_str = entry.split('||')

        assert brand not in known_brands
        # Raises if the pattern is not a valid regular expression.
        re.compile(regex_str)
        known_brands.add(brand)

    seen_notify: Set[str] = set()

    for entry in text_file_iter(settings.OCR_BRANDS_NOTIFY_DATA_PATH):
        assert entry in known_brands
        assert entry not in seen_notify
        seen_notify.add(entry)
def test_check_logo_annotation_brands():
    """Check the logo annotation brands data file.

    Each entry must contain the ``||`` separator and must not appear twice.
    """
    seen: Set[str] = set()

    for entry in text_file_iter(settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH):
        assert "||" in entry
        assert entry not in seen
        seen.add(entry)
def generate_packaging_keyword_processor(packaging: Optional[List[str]] = None):
    """Build a keyword processor for packaging entries.

    Args:
        packaging: entries to feed to ``generate_keyword_processor``. When
            ``None``, the entries are read from
            ``settings.OCR_PACKAGING_DATA_PATH`` instead.

    Returns:
        Whatever ``generate_keyword_processor`` returns for these entries.
    """
    if packaging is None:
        return generate_keyword_processor(
            text_file_iter(settings.OCR_PACKAGING_DATA_PATH)
        )

    return generate_keyword_processor(packaging)
def generate_image_flag_keyword_processor() -> KeywordProcessor:
    """Build a keyword processor from the image-flag data files.

    Every name read from the "beauty" and "miscellaneous" files is registered
    as a keyword whose clean name is the ``(name, key)`` tuple, ``key`` being
    the label of the file the name came from.
    """
    processor = KeywordProcessor()

    # Insertion order matters: "beauty" is loaded before "miscellaneous".
    flag_files = {
        "beauty": settings.OCR_IMAGE_FLAG_BEAUTY_PATH,
        "miscellaneous": settings.OCR_IMAGE_FLAG_MISCELLANEOUS_PATH,
    }

    for key, file_path in flag_files.items():
        for name in text_file_iter(file_path):
            processor.add_keyword(name, clean_name=(name, key))

    return processor
def get_logo_annotation_brands() -> Dict[str, str]:
    """Load the logo annotation brands mapping.

    Each entry of ``settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH`` is
    expected to have the form ``left||right``; the left part becomes the key
    and the right part the value of the returned dict. Entries without the
    ``||`` separator are reported with a warning and skipped. When the same
    key appears several times, the last entry wins.

    Returns:
        Dict mapping the text before ``||`` to the text after it.

    Raises:
        ValueError: if an entry contains more than one ``||`` separator
            (the split no longer unpacks into two parts).
    """
    brands: Dict[str, str] = {}

    for item in text_file_iter(settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH):
        if "||" not in item:
            # `Logger.warn` is a deprecated alias of `Logger.warning`.
            logger.warning("'||' separator expected!")
            continue

        logo_description, label_tag = item.split("||")
        brands[logo_description] = label_tag

    return brands
def get_sorted_stores() -> List[Tuple[str, str]]:
    """Load the store names and their regex strings, sorted.

    Entries of ``settings.OCR_STORES_DATA_PATH`` are either ``store||regex``
    or a bare store name; for a bare name the regex is the escaped lowercase
    name. A store listed more than once keeps its last regex.

    Returns:
        List of ``(store, regex_str)`` pairs ordered by ``store_sort_key``.
    """
    patterns_by_store: Dict[str, str] = {}

    for line in text_file_iter(settings.OCR_STORES_DATA_PATH):
        if '||' not in line:
            patterns_by_store[line] = re.escape(line.lower())
            continue

        name, pattern = line.split('||')
        patterns_by_store[name] = pattern

    ordered = list(patterns_by_store.items())
    ordered.sort(key=store_sort_key)
    return ordered
def get_logo_annotation_labels() -> Dict[str, str]:
    """Load the logo annotation labels mapping.

    Each entry of ``settings.OCR_LOGO_ANNOTATION_LABELS_DATA_PATH`` is
    expected to have the form ``left||right``; the left part becomes the key
    and the right part the value of the returned dict. Entries without the
    ``||`` separator are reported with a warning and skipped. When the same
    key appears several times, the last entry wins.

    Returns:
        Dict mapping the text before ``||`` to the text after it.

    Raises:
        ValueError: if an entry contains more than one ``||`` separator
            (the split no longer unpacks into two parts).
    """
    labels: Dict[str, str] = {}

    for item in text_file_iter(settings.OCR_LOGO_ANNOTATION_LABELS_DATA_PATH):
        if '||' not in item:
            # `Logger.warn` is a deprecated alias of `Logger.warning`.
            logger.warning("'||' separator expected!")
            continue

        logo_description, label_tag = item.split('||')
        labels[logo_description] = label_tag

    return labels
def get_sorted_brands() -> List[Tuple[str, str]]:
    """Load the brand names and their regex strings, sorted.

    Entries of ``settings.OCR_BRANDS_DATA_PATH`` are either ``brand||regex``
    or a bare brand name; for a bare name the regex is the escaped lowercase
    name. A brand listed more than once keeps its last regex.

    Returns:
        List of ``(brand, regex_str)`` pairs ordered by ``brand_sort_key``.
    """
    regex_by_brand: Dict[str, str] = {}

    for entry in text_file_iter(settings.OCR_BRANDS_DATA_PATH):
        has_custom_regex = '||' in entry
        if has_custom_regex:
            name, pattern = entry.split('||')
        else:
            name, pattern = entry, re.escape(entry.lower())
        regex_by_brand[name] = pattern

    pairs = list(regex_by_brand.items())
    pairs.sort(key=brand_sort_key)
    return pairs
def test_check_ocr_stores():
    """Check the consistency of the OCR store data files.

    For every entry of the stores data file: the raw entry must be unique,
    must not contain the ``’`` character and its regex string must compile.
    An entry is either ``store||regex`` or a bare store name, in which case
    the regex is the escaped lowercase name.

    Every entry of the notify data file must be unique and must name a store
    collected from the stores data file.
    """
    seen_entries: Set[str] = set()
    known_stores: Set[str] = set()

    for entry in text_file_iter(settings.OCR_STORES_DATA_PATH):
        assert entry not in seen_entries
        seen_entries.add(entry)

        assert '’' not in entry

        if '||' not in entry:
            store, regex_str = entry, re.escape(entry.lower())
        else:
            store, regex_str = entry.split('||')

        # Raises if the pattern is not a valid regular expression.
        re.compile(regex_str)
        known_stores.add(store)

    seen_notify: Set[str] = set()

    for entry in text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH):
        assert entry in known_stores
        assert entry not in seen_notify
        seen_notify.add(entry)
def test_packaging_format():
    """Check the format of the packaging data file.

    Each entry must be ``key||pattern`` with exactly one ``||`` separator,
    no ``;``-separated part of the key may start or end with a space, and
    the lowercased pattern must be unique across the file.
    """
    seen_patterns = set()
    entries = list(text_file_iter(settings.OCR_PACKAGING_DATA_PATH))

    for entry in entries:
        assert "||" in entry, f"missing || separator for item {entry}"

        parts = entry.split("||")
        assert len(parts) == 2, f"key||pattern format expected, here: {entry}"
        key, raw_pattern = parts

        # A part equal to its space-stripped form has no leading/trailing space.
        assert all(
            sub_key == sub_key.strip(" ") for sub_key in key.split(";")
        ), f"space after ';' separator: {entry}"

        pattern = raw_pattern.lower()
        assert pattern not in seen_patterns, f"duplicated pattern: {pattern}"
        seen_patterns.add(pattern)
if '||' in item: store, regex_str = item.split('||') else: store = item regex_str = re.escape(item.lower()) sorted_stores[store] = regex_str return sorted(sorted_stores.items(), key=store_sort_key) SORTED_STORES = get_sorted_stores() STORE_REGEX_STR = "|".join(r"((?<!\w){}(?!\w))".format(pattern) for _, pattern in SORTED_STORES) NOTIFY_STORES: Set[str] = set( text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH)) STORE_REGEX = OCRRegex(re.compile(STORE_REGEX_STR), field=OCRField.full_text_contiguous, lowercase=True) def find_stores(ocr_result: OCRResult) -> List[Dict]: results = [] text = ocr_result.get_text(STORE_REGEX) if not text: return [] for match in STORE_REGEX.regex.finditer(text): groups = match.groups()
if '||' in item: brand, regex_str = item.split('||') else: brand = item regex_str = re.escape(item.lower()) sorted_brands[brand] = regex_str return sorted(sorted_brands.items(), key=brand_sort_key) SORTED_BRANDS = get_sorted_brands() BRAND_REGEX_STR = "|".join(r"((?<!\w){}(?!\w))".format(pattern) for _, pattern in SORTED_BRANDS) NOTIFY_BRANDS_WHITELIST: Set[str] = set( text_file_iter(settings.OCR_BRANDS_NOTIFY_WHITELIST_DATA_PATH)) BRAND_REGEX = OCRRegex(re.compile(BRAND_REGEX_STR), field=OCRField.full_text_contiguous, lowercase=True) def find_brands(ocr_result: OCRResult) -> List[Dict]: results = [] text = ocr_result.get_text(BRAND_REGEX) if not text: return [] for match in BRAND_REGEX.regex.finditer(text): groups = match.groups()
for item in text_file_iter(settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH): if "||" in item: logo_description, label_tag = item.split("||") else: logger.warn("'||' separator expected!") continue brands[logo_description] = label_tag return brands LOGO_ANNOTATION_BRANDS: Dict[str, str] = get_logo_annotation_brands() TAXONOMY_BRAND_PROCESSOR = generate_brand_keyword_processor( text_file_iter(settings.OCR_TAXONOMY_BRANDS_PATH)) BRAND_PROCESSOR = generate_brand_keyword_processor( text_file_iter(settings.OCR_BRANDS_PATH), ) def extract_brands(processor: KeywordProcessor, text: str, data_source_name: str) -> List[RawInsight]: insights = [] for (brand_tag, brand), span_start, span_end in processor.extract_keywords( text, span_info=True): match_str = text[span_start:span_end] insights.append( RawInsight( type=InsightType.brand, value=brand,
def generate_fishing_code_keyword_processor() -> KeywordProcessor:
    """Build a keyword processor for fishing codes.

    Each code read from ``settings.OCR_FISHING_FLASHTEXT_DATA_PATH`` is turned
    into an ``UPPERCASED||original`` entry, and the resulting entries are
    passed lazily to ``generate_keyword_processor``.
    """
    codes = text_file_iter(settings.OCR_FISHING_FLASHTEXT_DATA_PATH)
    entries = (f"{code.upper()}||{code}" for code in codes)
    return generate_keyword_processor(entries)
def brand_keyword_processor():
    """Yield a brand keyword processor built from ``settings.OCR_BRANDS_PATH``.

    NOTE(review): the generator shape suggests a pytest fixture, but no
    decorator is visible here; confirm against the full file.
    """
    brand_entries = text_file_iter(settings.OCR_BRANDS_PATH)
    processor = generate_brand_keyword_processor(brand_entries)
    yield processor
def generate_label_keyword_processor(labels: Optional[List[str]] = None):
    """Build a keyword processor for label entries.

    Args:
        labels: entries to feed to ``generate_keyword_processor``. When
            ``None``, the entries are read from
            ``settings.OCR_LABEL_FLASHTEXT_DATA_PATH`` instead.

    Returns:
        Whatever ``generate_keyword_processor`` returns for these entries.
    """
    source = (
        text_file_iter(settings.OCR_LABEL_FLASHTEXT_DATA_PATH)
        if labels is None
        else labels
    )
    return generate_keyword_processor(source)
def load_authorized_labels() -> Set[str]:
    """Return the set of entries in ``settings.OCR_LABEL_WHITELIST_DATA_PATH``."""
    authorized: Set[str] = set()
    authorized.update(text_file_iter(settings.OCR_LABEL_WHITELIST_DATA_PATH))
    return authorized
def generate_trace_keyword_processor(labels: Optional[List[str]] = None):
    """Build a keyword processor for trace/allergen entries.

    Args:
        labels: entries to feed to ``generate_keyword_processor``. When
            ``None``, the entries of
            ``settings.OCR_TRACE_ALLERGEN_DATA_PATH`` are read into a list
            and used instead.

    Returns:
        Whatever ``generate_keyword_processor`` returns for these entries.
    """
    if labels is not None:
        return generate_keyword_processor(labels)

    default_labels = list(text_file_iter(settings.OCR_TRACE_ALLERGEN_DATA_PATH))
    return generate_keyword_processor(default_labels)
if "||" in item: store, regex_str = item.split("||") else: store = item regex_str = re.escape(item.lower()) sorted_stores[store] = regex_str return sorted(sorted_stores.items(), key=store_sort_key) SORTED_STORES = get_sorted_stores() STORE_REGEX_STR = "|".join( r"((?<!\w){}(?!\w))".format(pattern) for _, pattern in SORTED_STORES ) NOTIFY_STORES: Set[str] = set(text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH)) STORE_REGEX = OCRRegex( re.compile(STORE_REGEX_STR), field=OCRField.full_text_contiguous, lowercase=True ) def find_stores(content: Union[OCRResult, str]) -> List[Prediction]: results = [] text = get_text(content, STORE_REGEX) if not text: return [] for match in STORE_REGEX.regex.finditer(text): groups = match.groups()
def inner_fn() -> AbstractSet[str]:
    """Return the set of normalized tokens read from ``voc_path``.

    ``cls`` and ``voc_path`` come from the enclosing scope; each token yielded
    by ``text_file_iter`` is passed through ``cls.normalize``.
    """
    return {cls.normalize(token) for token in text_file_iter(voc_path)}
def get_brand_blacklist() -> Set[str]:
    """Return the set of entries in ``settings.OCR_TAXONOMY_BRANDS_BLACKLIST_PATH``."""
    blacklist_entries = text_file_iter(settings.OCR_TAXONOMY_BRANDS_BLACKLIST_PATH)
    return {*blacklist_entries}
def get_fr_known_tokens() -> Set[str]:
    """Return the union of the ingredient tokens and the FR tokens.

    Both ``INGREDIENT_TOKENS_PATH`` and ``FR_TOKENS_PATH`` are read with
    ``text_file_iter(..., comment=False)`` and merged into a single set.
    """
    known: Set[str] = set()
    for path in (INGREDIENT_TOKENS_PATH, FR_TOKENS_PATH):
        known.update(text_file_iter(path, comment=False))
    return known
def load_category_blacklist() -> Set[str]:
    """Return the set of entries in ``settings.CATEGORY_CLF_CATEGORY_BLACKLIST``."""
    blacklisted: Set[str] = set()
    for category in text_file_iter(settings.CATEGORY_CLF_CATEGORY_BLACKLIST):
        blacklisted.add(category)
    return blacklisted
def brand_taxonomy_keyword_processor():
    """Yield a brand keyword processor built from ``settings.OCR_TAXONOMY_BRANDS_PATH``.

    NOTE(review): the generator shape suggests a pytest fixture, but no
    decorator is visible here; confirm against the full file.
    """
    taxonomy_entries = text_file_iter(settings.OCR_TAXONOMY_BRANDS_PATH)
    processor = generate_brand_keyword_processor(taxonomy_entries)
    yield processor