Ejemplo n.º 1
0
def test_check_ocr_brands():
    """Validate the OCR brand data file and its companion notify file.

    Every entry of the brand file must be unique, must not contain the
    typographic apostrophe, must produce a brand name not seen before and a
    pattern accepted by ``re.compile``. Every entry of the notify file must
    be unique and must be one of the brand names collected from the brand
    file.
    """
    seen_entries: Set[str] = set()
    known_brands: Set[str] = set()

    for entry in text_file_iter(settings.OCR_BRANDS_DATA_PATH):
        assert entry not in seen_entries
        seen_entries.add(entry)

        assert '’' not in entry
        # An entry is either "brand||regex" or a bare brand name; for a bare
        # name the pattern is the escaped, lower-cased name.
        brand, pattern = (
            entry.split('||')
            if '||' in entry
            else (entry, re.escape(entry.lower()))
        )

        assert brand not in known_brands
        re.compile(pattern)  # raises if the pattern is not a valid regex
        known_brands.add(brand)

    seen_notify: Set[str] = set()
    for entry in text_file_iter(settings.OCR_BRANDS_NOTIFY_DATA_PATH):
        assert entry in known_brands
        assert entry not in seen_notify
        seen_notify.add(entry)
Ejemplo n.º 2
0
def test_check_logo_annotation_brands():
    """Check the logo-annotation brand file: each entry has '||' and is unique."""
    already_seen: Set[str] = set()

    for entry in text_file_iter(settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH):
        # Separator check comes first, then the duplicate check.
        assert "||" in entry
        assert entry not in already_seen
        already_seen.add(entry)
Ejemplo n.º 3
0
def generate_packaging_keyword_processor(packaging: Optional[List[str]] = None):
    """Build a keyword processor for packaging entries.

    Args:
        packaging: entries to hand to ``generate_keyword_processor``. When
            ``None``, the entries are read from
            ``settings.OCR_PACKAGING_DATA_PATH`` via ``text_file_iter``.

    Returns:
        Whatever ``generate_keyword_processor`` returns for those entries.
    """
    if packaging is not None:
        return generate_keyword_processor(packaging)

    return generate_keyword_processor(
        text_file_iter(settings.OCR_PACKAGING_DATA_PATH)
    )
Ejemplo n.º 4
0
def generate_image_flag_keyword_processor() -> KeywordProcessor:
    """Build a ``KeywordProcessor`` from the image-flag keyword files.

    Each keyword read from the "beauty" and "miscellaneous" files is
    registered with a ``(keyword, flag)`` tuple as its clean name, where
    ``flag`` is the label of the file the keyword came from.
    """
    keyword_processor = KeywordProcessor()

    flag_sources = (
        ("beauty", settings.OCR_IMAGE_FLAG_BEAUTY_PATH),
        ("miscellaneous", settings.OCR_IMAGE_FLAG_MISCELLANEOUS_PATH),
    )
    for flag, path in flag_sources:
        for keyword in text_file_iter(path):
            keyword_processor.add_keyword(keyword, clean_name=(keyword, flag))

    return keyword_processor
Ejemplo n.º 5
0
def get_logo_annotation_brands() -> Dict[str, str]:
    """Load the logo description -> label tag mapping for brands.

    Entries are read from ``settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH``
    via ``text_file_iter``. Each entry is expected to look like
    ``<logo description>||<label tag>``; entries without the ``||``
    separator are logged as a warning and skipped.

    Returns:
        A dict mapping each logo description to its label tag. If a
        description occurs more than once, the last entry wins.

    Raises:
        ValueError: if an entry contains more than one ``||`` separator
            (the split no longer unpacks into two values).
    """
    brands: Dict[str, str] = {}

    for item in text_file_iter(settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH):
        if "||" not in item:
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning("'||' separator expected!")
            continue

        logo_description, label_tag = item.split("||")
        brands[logo_description] = label_tag

    return brands
Ejemplo n.º 6
0
def get_sorted_stores() -> List[Tuple[str, str]]:
    """Return the ``(store, regex)`` pairs of the store file, sorted.

    An entry of ``settings.OCR_STORES_DATA_PATH`` is either
    ``store||regex`` or a bare store name; for a bare name the regex is the
    escaped, lower-cased name. A store listed several times keeps the regex
    of its last entry. Pairs are ordered with ``store_sort_key``.
    """
    regex_by_store: Dict[str, str] = {}

    for entry in text_file_iter(settings.OCR_STORES_DATA_PATH):
        if '||' not in entry:
            # Bare name: match the name itself, literally and in lower case.
            regex_by_store[entry] = re.escape(entry.lower())
            continue

        name, pattern = entry.split('||')
        regex_by_store[name] = pattern

    return sorted(regex_by_store.items(), key=store_sort_key)
Ejemplo n.º 7
0
def get_logo_annotation_labels() -> Dict[str, str]:
    """Load the logo description -> label tag mapping for labels.

    Entries are read from ``settings.OCR_LOGO_ANNOTATION_LABELS_DATA_PATH``
    via ``text_file_iter``. Each entry is expected to look like
    ``<logo description>||<label tag>``; entries without the ``||``
    separator are logged as a warning and skipped.

    Returns:
        A dict mapping each logo description to its label tag. If a
        description occurs more than once, the last entry wins.

    Raises:
        ValueError: if an entry contains more than one ``||`` separator
            (the split no longer unpacks into two values).
    """
    labels: Dict[str, str] = {}

    for item in text_file_iter(settings.OCR_LOGO_ANNOTATION_LABELS_DATA_PATH):
        if '||' not in item:
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning("'||' separator expected!")
            continue

        logo_description, label_tag = item.split('||')
        labels[logo_description] = label_tag

    return labels
Ejemplo n.º 8
0
def get_sorted_brands() -> List[Tuple[str, str]]:
    """Return the ``(brand, regex)`` pairs of the brand file, sorted.

    An entry of ``settings.OCR_BRANDS_DATA_PATH`` is either
    ``brand||regex`` or a bare brand name; for a bare name the regex is the
    escaped, lower-cased name. A brand listed several times keeps the regex
    of its last entry. Pairs are ordered with ``brand_sort_key``.
    """
    regex_by_brand: Dict[str, str] = {}

    for entry in text_file_iter(settings.OCR_BRANDS_DATA_PATH):
        name, pattern = (
            entry.split('||')
            if '||' in entry
            else (entry, re.escape(entry.lower()))
        )
        regex_by_brand[name] = pattern

    return sorted(regex_by_brand.items(), key=brand_sort_key)
Ejemplo n.º 9
0
def test_check_ocr_stores():
    """Validate the OCR store data file and its companion notify file.

    Every entry of the store file must be unique, must not contain the
    typographic apostrophe, and must produce a pattern accepted by
    ``re.compile``. Every entry of the notify file must be unique and must
    be one of the store names collected from the store file.
    """
    seen_entries: Set[str] = set()
    known_stores: Set[str] = set()

    for entry in text_file_iter(settings.OCR_STORES_DATA_PATH):
        assert entry not in seen_entries
        seen_entries.add(entry)

        assert '’' not in entry
        # An entry is either "store||regex" or a bare store name; for a bare
        # name the pattern is the escaped, lower-cased name.
        store, pattern = (
            entry.split('||')
            if '||' in entry
            else (entry, re.escape(entry.lower()))
        )

        re.compile(pattern)  # raises if the pattern is not a valid regex
        known_stores.add(store)

    seen_notify: Set[str] = set()
    for entry in text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH):
        assert entry in known_stores
        assert entry not in seen_notify
        seen_notify.add(entry)
Ejemplo n.º 10
0
def test_packaging_format():
    """Validate the ``key||pattern`` format of the packaging data file.

    Each entry must contain exactly one ``||`` separator, no ';'-separated
    part of the key may start or end with a space, and the lower-cased
    pattern must not repeat across entries.
    """
    seen_patterns = set()
    entries = list(text_file_iter(settings.OCR_PACKAGING_DATA_PATH))

    for item in entries:
        assert "||" in item, f"missing || separator for item {item}"
        parts = item.split("||")
        assert len(parts) == 2, f"key||pattern format expected, here: {item}"

        key, raw_pattern = parts
        assert all(
            not sub_key.startswith(" ") and not sub_key.endswith(" ")
            for sub_key in key.split(";")
        ), f"space after ';' separator: {item}"

        # Duplicates are detected case-insensitively.
        pattern = raw_pattern.lower()
        assert pattern not in seen_patterns, f"duplicated pattern: {pattern}"
        seen_patterns.add(pattern)
Ejemplo n.º 11
0
        if '||' in item:
            store, regex_str = item.split('||')
        else:
            store = item
            regex_str = re.escape(item.lower())

        sorted_stores[store] = regex_str

    return sorted(sorted_stores.items(), key=store_sort_key)


# (store, pattern) pairs as returned by get_sorted_stores().
SORTED_STORES = get_sorted_stores()
# One alternation with a capture group per store pattern; the lookarounds
# reject a match that has a word character directly before or after it.
STORE_REGEX_STR = "|".join(
    r"((?<!\w){}(?!\w))".format(store_pattern)
    for _store, store_pattern in SORTED_STORES
)
# Entries of the store notify file, as a set.
NOTIFY_STORES: Set[str] = set(
    text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH)
)
STORE_REGEX = OCRRegex(
    re.compile(STORE_REGEX_STR),
    field=OCRField.full_text_contiguous,
    lowercase=True,
)


def find_stores(ocr_result: OCRResult) -> List[Dict]:
    results = []

    text = ocr_result.get_text(STORE_REGEX)

    if not text:
        return []

    for match in STORE_REGEX.regex.finditer(text):
        groups = match.groups()
Ejemplo n.º 12
0
        if '||' in item:
            brand, regex_str = item.split('||')
        else:
            brand = item
            regex_str = re.escape(item.lower())

        sorted_brands[brand] = regex_str

    return sorted(sorted_brands.items(), key=brand_sort_key)


# (brand, pattern) pairs as returned by get_sorted_brands().
SORTED_BRANDS = get_sorted_brands()
# One alternation with a capture group per brand pattern; the lookarounds
# reject a match that has a word character directly before or after it.
BRAND_REGEX_STR = "|".join(
    r"((?<!\w){}(?!\w))".format(brand_pattern)
    for _brand, brand_pattern in SORTED_BRANDS
)
# Entries of the brand notify whitelist file, as a set.
NOTIFY_BRANDS_WHITELIST: Set[str] = set(
    text_file_iter(settings.OCR_BRANDS_NOTIFY_WHITELIST_DATA_PATH)
)
BRAND_REGEX = OCRRegex(
    re.compile(BRAND_REGEX_STR),
    field=OCRField.full_text_contiguous,
    lowercase=True,
)


def find_brands(ocr_result: OCRResult) -> List[Dict]:
    results = []

    text = ocr_result.get_text(BRAND_REGEX)

    if not text:
        return []

    for match in BRAND_REGEX.regex.finditer(text):
        groups = match.groups()
Ejemplo n.º 13
0
    for item in text_file_iter(settings.OCR_LOGO_ANNOTATION_BRANDS_DATA_PATH):
        if "||" in item:
            logo_description, label_tag = item.split("||")
        else:
            logger.warn("'||' separator expected!")
            continue

        brands[logo_description] = label_tag

    return brands


# Logo description -> label tag mapping, loaded once at import time.
LOGO_ANNOTATION_BRANDS: Dict[str, str] = get_logo_annotation_brands()
# Keyword processor built from the taxonomy brand file.
TAXONOMY_BRAND_PROCESSOR = generate_brand_keyword_processor(
    text_file_iter(settings.OCR_TAXONOMY_BRANDS_PATH)
)
# Keyword processor built from the brand file.
BRAND_PROCESSOR = generate_brand_keyword_processor(
    text_file_iter(settings.OCR_BRANDS_PATH)
)


def extract_brands(processor: KeywordProcessor, text: str,
                   data_source_name: str) -> List[RawInsight]:
    insights = []

    for (brand_tag, brand), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        insights.append(
            RawInsight(
                type=InsightType.brand,
                value=brand,
Ejemplo n.º 14
0
def generate_fishing_code_keyword_processor() -> KeywordProcessor:
    """Build a keyword processor for the fishing codes.

    Each code read from ``settings.OCR_FISHING_FLASHTEXT_DATA_PATH`` is
    turned into an ``<UPPER-CASED CODE>||<code>`` entry and the entries are
    passed lazily to ``generate_keyword_processor``.
    """
    entries = (
        "{}||{}".format(code.upper(), code)
        for code in text_file_iter(settings.OCR_FISHING_FLASHTEXT_DATA_PATH)
    )
    return generate_keyword_processor(entries)
Ejemplo n.º 15
0
def brand_keyword_processor():
    """Yield a brand keyword processor built from ``settings.OCR_BRANDS_PATH``."""
    brand_entries = text_file_iter(settings.OCR_BRANDS_PATH)
    processor = generate_brand_keyword_processor(brand_entries)
    yield processor
Ejemplo n.º 16
0
def generate_label_keyword_processor(labels: Optional[List[str]] = None):
    """Build a keyword processor for label entries.

    Args:
        labels: entries to hand to ``generate_keyword_processor``. When
            ``None``, the entries are read from
            ``settings.OCR_LABEL_FLASHTEXT_DATA_PATH`` via ``text_file_iter``.

    Returns:
        Whatever ``generate_keyword_processor`` returns for those entries.
    """
    entries = (
        text_file_iter(settings.OCR_LABEL_FLASHTEXT_DATA_PATH)
        if labels is None
        else labels
    )
    return generate_keyword_processor(entries)
Ejemplo n.º 17
0
def load_authorized_labels() -> Set[str]:
    """Return the entries of ``settings.OCR_LABEL_WHITELIST_DATA_PATH`` as a set."""
    whitelist_entries = text_file_iter(settings.OCR_LABEL_WHITELIST_DATA_PATH)
    return set(whitelist_entries)
Ejemplo n.º 18
0
def generate_trace_keyword_processor(labels: Optional[List[str]] = None):
    """Build a keyword processor for trace/allergen entries.

    Args:
        labels: entries to hand to ``generate_keyword_processor``. When
            ``None``, the entries of ``settings.OCR_TRACE_ALLERGEN_DATA_PATH``
            are read into a list and used instead.

    Returns:
        Whatever ``generate_keyword_processor`` returns for those entries.
    """
    if labels is not None:
        return generate_keyword_processor(labels)

    default_labels = list(text_file_iter(settings.OCR_TRACE_ALLERGEN_DATA_PATH))
    return generate_keyword_processor(default_labels)
Ejemplo n.º 19
0
        if "||" in item:
            store, regex_str = item.split("||")
        else:
            store = item
            regex_str = re.escape(item.lower())

        sorted_stores[store] = regex_str

    return sorted(sorted_stores.items(), key=store_sort_key)


# (store, pattern) pairs as returned by get_sorted_stores().
SORTED_STORES = get_sorted_stores()
# One alternation with a capture group per store pattern; the lookarounds
# reject a match that has a word character directly before or after it.
STORE_REGEX_STR = "|".join(
    r"((?<!\w){}(?!\w))".format(store_pattern)
    for _store, store_pattern in SORTED_STORES
)
# Entries of the store notify file, as a set.
NOTIFY_STORES: Set[str] = set(
    text_file_iter(settings.OCR_STORES_NOTIFY_DATA_PATH)
)
STORE_REGEX = OCRRegex(
    re.compile(STORE_REGEX_STR),
    lowercase=True,
    field=OCRField.full_text_contiguous,
)


def find_stores(content: Union[OCRResult, str]) -> List[Prediction]:
    results = []

    text = get_text(content, STORE_REGEX)

    if not text:
        return []

    for match in STORE_REGEX.regex.finditer(text):
        groups = match.groups()
Ejemplo n.º 20
0
 def inner_fn() -> AbstractSet[str]:
     """Return the set of normalized tokens read from ``voc_path``."""
     return {
         cls.normalize(raw_token) for raw_token in text_file_iter(voc_path)
     }
Ejemplo n.º 21
0
def get_brand_blacklist() -> Set[str]:
    """Return the entries of ``settings.OCR_TAXONOMY_BRANDS_BLACKLIST_PATH`` as a set."""
    blacklist_entries = text_file_iter(settings.OCR_TAXONOMY_BRANDS_BLACKLIST_PATH)
    return set(blacklist_entries)
Ejemplo n.º 22
0
def get_fr_known_tokens() -> Set[str]:
    """Return the union of the tokens from the two token files.

    Tokens are read from ``INGREDIENT_TOKENS_PATH`` first, then from
    ``FR_TOKENS_PATH``; both reads pass ``comment=False`` to
    ``text_file_iter``.
    """
    ingredient_tokens = set(text_file_iter(INGREDIENT_TOKENS_PATH, comment=False))
    fr_tokens = set(text_file_iter(FR_TOKENS_PATH, comment=False))
    return ingredient_tokens | fr_tokens
Ejemplo n.º 23
0
def load_category_blacklist() -> Set[str]:
    """Return the entries of ``settings.CATEGORY_CLF_CATEGORY_BLACKLIST`` as a set."""
    blacklist_entries = text_file_iter(settings.CATEGORY_CLF_CATEGORY_BLACKLIST)
    return set(blacklist_entries)
Ejemplo n.º 24
0
def brand_taxonomy_keyword_processor():
    """Yield a brand keyword processor built from ``settings.OCR_TAXONOMY_BRANDS_PATH``."""
    taxonomy_entries = text_file_iter(settings.OCR_TAXONOMY_BRANDS_PATH)
    processor = generate_brand_keyword_processor(taxonomy_entries)
    yield processor