Exemple #1
0
from pymongo import MongoClient

from robotoff.utils.cache import CachedStore
from robotoff import settings


def get_mongo_client() -> MongoClient:
    """Build a ``MongoClient`` from the URI configured in ``settings.MONGO_URI``."""
    mongo_uri = settings.MONGO_URI
    client = MongoClient(mongo_uri)
    return client


# Module-level store built around get_mongo_client.
# NOTE(review): expiration_interval=None presumably means the cached client
# never expires -- confirm against CachedStore.
MONGO_CLIENT_CACHE = CachedStore(get_mongo_client, expiration_interval=None)
Exemple #2
0
    for (label_tag, label), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        insights.append({
            "label_tag": label_tag,
            "text": match_str,
            "data_source": "flashtext",
            "notify": False,
        })

    return insights


# Logo annotation labels, computed by get_logo_annotation_labels() when this
# module is imported.
LOGO_ANNOTATION_LABELS: Dict[str, str] = get_logo_annotation_labels()
# Store built around generate_label_keyword_processor.
# NOTE(review): expiration_interval=None presumably means the cached value
# never expires -- confirm against CachedStore.
LABEL_KEYWORD_PROCESSOR_STORE = CachedStore(
    fetch_func=generate_label_keyword_processor, expiration_interval=None)


def find_labels(content: Union[OCRResult, str]) -> List[Dict]:
    insights = []

    for label_tag, regex_list in LABELS_REGEX.items():
        for ocr_regex in regex_list:
            text = get_text(content, ocr_regex)

            if not text:
                continue

            for match in ocr_regex.regex.finditer(text):
                if ocr_regex.processing_func:
                    label_value = ocr_regex.processing_func(match)
Exemple #3
0

def in_barcode_range(brand_prefix: Set[Tuple[str, str]], brand_tag: str,
                     barcode: str) -> bool:
    """Tell whether a barcode is compatible with the known brand prefixes.

    Only barcodes that are exactly 13 characters long are checked: for
    those, the pair ``(brand_tag, generate_barcode_prefix(barcode))`` must
    be a member of ``brand_prefix``. Barcodes of any other length always
    pass.

    Return True if the check passes, False otherwise.
    """
    # Barcodes that are not 13 characters long are never rejected.
    if len(barcode) != 13:
        return True

    return (brand_tag, generate_barcode_prefix(barcode)) in brand_prefix


# Stores built around get_brand_prefix and get_brand_blacklist respectively.
# NOTE(review): expiration_interval=None presumably means the cached values
# never expire -- confirm against CachedStore.
BRAND_PREFIX_STORE = CachedStore(fetch_func=get_brand_prefix,
                                 expiration_interval=None)
BRAND_BLACKLIST_STORE = CachedStore(fetch_func=get_brand_blacklist,
                                    expiration_interval=None)

if __name__ == "__main__":
    # Script entry point: load the brand blacklist directly (not through the
    # store), then dump the taxonomy brands using the matching thresholds
    # configured in settings.
    blacklisted_brands = get_brand_blacklist()
    dump_taxonomy_brands(
        threshold=settings.BRAND_MATCHING_MIN_COUNT,
        min_length=settings.BRAND_MATCHING_MIN_LENGTH,
        blacklisted_brands=blacklisted_brands,
    )
Exemple #4
0
        for predictions, product in zip(predictions_batch, product_batch):
            for insight in format_predictions(product, predictions, "xx"):
                yield insight


def format_predictions(product: Dict, predictions: List[CategoryPrediction],
                       lang: str) -> List[Dict]:
    """Turn category predictions for one product into insight dictionaries.

    Args:
        product: product data; its ``"code"`` entry is used as the barcode.
        predictions: ``(category, confidence)`` pairs for this product.
        lang: language code copied into every returned dictionary.

    Returns:
        One dictionary per prediction, in the same order as ``predictions``,
        with the keys ``barcode``, ``category``, ``lang``, ``model`` (always
        ``"neural"``) and ``confidence``.
    """
    return [
        {
            "barcode": product["code"],
            "category": predicted_category,
            "lang": lang,
            "model": "neural",
            "confidence": score,
        }
        for predicted_category, score in predictions
    ]


def filter_blacklisted_categories(
    predictions: List[CategoryPrediction], ) -> List[CategoryPrediction]:
    """Drop predictions whose category is in the category blacklist.

    The blacklist is read from ``CATEGORY_BLACKLIST_STORE``. The relative
    order of the remaining ``(category, confidence)`` pairs is preserved.
    """
    blacklist: Set[str] = CATEGORY_BLACKLIST_STORE.get()
    kept = []

    for category, confidence in predictions:
        if category in blacklist:
            continue
        kept.append((category, confidence))

    return kept


# Store built around load_category_blacklist, using CachedStore's default
# expiration settings (none are passed here).
CATEGORY_BLACKLIST_STORE = CachedStore(load_category_blacklist)
    for (key, _), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        insights.append(
            RawInsight(
                type=InsightType.packager_code,
                value=key,
                predictor="flashtext",
                data={
                    "type": "fishing",
                    "raw": match_str,
                    "notify": False
                },
                automatic_processing=True,
            ))

    return insights


# Store built around generate_fishing_code_keyword_processor.
# NOTE(review): expiration_interval=None presumably means the cached
# processor never expires -- confirm against CachedStore.
FISHING_KEYWORD_PROCESSOR_STORE = CachedStore(
    fetch_func=generate_fishing_code_keyword_processor,
    expiration_interval=None)


def find_packager_codes(ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Collect packager code insights from an OCR result or raw text.

    Regex-based insights come first, followed by the fishing codes found by
    the keyword processor held in ``FISHING_KEYWORD_PROCESSOR_STORE``.
    """
    results = find_packager_codes_regex(ocr_result)
    keyword_processor = FISHING_KEYWORD_PROCESSOR_STORE.get()
    # Extend the regex results in place with the keyword-based matches.
    results.extend(
        extract_fishing_code(keyword_processor, get_text(ocr_result)))
    return results
Exemple #6
0
    def __getitem__(self, barcode: str) -> Optional[Product]:
        """Return the product for ``barcode`` wrapped in ``Product``.

        The lookup goes through ``self.get_product``; a falsy result yields
        ``None``.
        """
        raw_product = self.get_product(barcode)
        return Product(raw_product) if raw_product else None

    def __iter__(self):
        """Iteration is not supported by this store.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("cannot iterate over database product store")


def load_min_dataset() -> ProductStore:
    """Load the product store via ``MemoryProductStore.load_min`` and log its size."""
    product_store = MemoryProductStore.load_min()
    item_count = len(product_store)
    logger.info("product store loaded ({} items)".format(item_count))
    return product_store


def get_product_store() -> DBProductStore:
    """Return a ``DBProductStore`` using the client from ``MONGO_CLIENT_CACHE``."""
    return DBProductStore(client=MONGO_CLIENT_CACHE.get())


def get_product(barcode: str,
                projection: Optional[List[str]] = None) -> Optional[JSONType]:
    """Fetch a single product document by barcode.

    Args:
        barcode: value matched against the ``code`` field.
        projection: passed through unchanged as the second argument of
            ``find_one``.

    Returns:
        Whatever ``find_one`` returns for the ``off.products`` collection.
    """
    products = MONGO_CLIENT_CACHE.get().off.products
    query = {"code": barcode}
    return products.find_one(query, projection)


# Store built around load_min_dataset, using CachedStore's default expiration
# settings (none are passed here).
CACHED_PRODUCT_STORE = CachedStore(load_min_dataset)
Exemple #7
0
        data = r.json()
    except Exception as e:
        logger.exception(f"{type(e)} exception while fetching taxonomy at %s", url)
        if fallback_path:
            return Taxonomy.from_json(fallback_path)
        else:
            return None

    return Taxonomy.from_dict(data)


TAXONOMY_STORES: Dict[str, CachedStore] = {
    TaxonomyType.category.name: CachedStore(
        functools.partial(
            fetch_taxonomy,
            url=settings.TAXONOMY_CATEGORY_URL,
            fallback_path=settings.TAXONOMY_CATEGORY_PATH,
        )
    ),
    TaxonomyType.ingredient.name: CachedStore(
        functools.partial(
            fetch_taxonomy,
            url=settings.TAXONOMY_INGREDIENT_URL,
            fallback_path=settings.TAXONOMY_INGREDIENT_PATH,
        )
    ),
    TaxonomyType.label.name: CachedStore(
        functools.partial(
            fetch_taxonomy,
            url=settings.TAXONOMY_LABEL_URL,
            fallback_path=settings.TAXONOMY_LABEL_PATH,
Exemple #8
0
    try:
        r = requests.get(url, timeout=5)
        data = r.json()
    except Exception:
        if fallback_path:
            return Taxonomy.from_json(fallback_path)
        else:
            return None

    return Taxonomy.from_dict(data)


# Maps each taxonomy type name (category, ingredient, label) to a CachedStore
# whose fetch function is fetch_taxonomy bound to that taxonomy's URL and
# fallback path from settings.
TAXONOMY_STORES: Dict[str, CachedStore] = {
    TaxonomyType.category.name:
    CachedStore(
        functools.partial(fetch_taxonomy,
                          url=settings.TAXONOMY_CATEGORY_URL,
                          fallback_path=settings.TAXONOMY_CATEGORY_PATH)),
    TaxonomyType.ingredient.name:
    CachedStore(
        functools.partial(fetch_taxonomy,
                          url=settings.TAXONOMY_INGREDIENT_URL,
                          fallback_path=settings.TAXONOMY_INGREDIENT_PATH)),
    TaxonomyType.label.name:
    CachedStore(
        functools.partial(fetch_taxonomy,
                          url=settings.TAXONOMY_LABEL_URL,
                          fallback_path=settings.TAXONOMY_LABEL_PATH))
}
Exemple #9
0
from robotoff.insights.ocr.utils import generate_keyword_processor

from robotoff.insights.ocr.utils import get_tag
from robotoff.utils import text_file_iter
from robotoff.utils.cache import CachedStore


def generate_packaging_keyword_processor(
        packaging: Optional[List[str]] = None):
    """Build a keyword processor for packaging terms.

    Args:
        packaging: entries to feed to ``generate_keyword_processor``. When
            ``None``, the entries are read from
            ``settings.OCR_PACKAGING_DATA_PATH`` with ``text_file_iter``.

    Returns:
        The result of ``generate_keyword_processor`` for those entries.
    """
    entries = (text_file_iter(settings.OCR_PACKAGING_DATA_PATH)
               if packaging is None else packaging)
    return generate_keyword_processor(entries)


# Store built around generate_packaging_keyword_processor.
# NOTE(review): expiration_interval=None presumably means the cached
# processor never expires -- confirm against CachedStore.
KEYWORD_PROCESSOR_STORE = CachedStore(
    fetch_func=generate_packaging_keyword_processor, expiration_interval=None)


def find_packaging(content: Union[OCRResult, str]) -> List[Dict]:
    insights = []

    text = get_text(content)

    if not text:
        return []

    processor = KEYWORD_PROCESSOR_STORE.get()

    for (packaging_str, _), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        packagings = packaging_str.split(";")
Exemple #10
0
from robotoff.models import batch_insert, ProductInsight
from robotoff.off import get_server_type
from robotoff.products import get_product_store, is_valid_image, Product, ProductStore
from robotoff.taxonomy import get_taxonomy, Taxonomy, TaxonomyNode
from robotoff.utils import get_logger, text_file_iter
from robotoff.utils.cache import CachedStore
from robotoff.utils.types import JSONType

# Module-level logger, obtained from get_logger with this module's name.
logger = get_logger(__name__)


def load_authorized_labels() -> Set[str]:
    """Read the label whitelist file into a set.

    The entries are those yielded by ``text_file_iter`` for
    ``settings.OCR_LABEL_WHITELIST_DATA_PATH``.
    """
    whitelist_path = settings.OCR_LABEL_WHITELIST_DATA_PATH
    return {label for label in text_file_iter(whitelist_path)}


# Store built around load_authorized_labels.
# NOTE(review): expiration_interval=None presumably means the cached set
# never expires -- confirm against CachedStore.
AUTHORIZED_LABELS_STORE = CachedStore(load_authorized_labels,
                                      expiration_interval=None)


def generate_seen_set_query(insight_type: InsightType, barcode: str,
                            server_domain: str):
    """Build the query selecting already-known insight values for a product.

    Only the ``value`` and ``value_tag`` columns of ``ProductInsight`` are
    selected, restricted to rows with the given insight type name, barcode
    and server domain whose ``latent`` field equals ``False``.
    """
    base_query = ProductInsight.select(ProductInsight.value,
                                       ProductInsight.value_tag)
    filters = [
        ProductInsight.type == insight_type.name,
        # The explicit comparison is intentional: it is passed to where() as
        # a filter expression, hence the lint suppression.
        ProductInsight.latent == False,  # noqa: E712
        ProductInsight.barcode == barcode,
        ProductInsight.server_domain == server_domain,
    ]
    return base_query.where(*filters)


def is_reserved_barcode(barcode: str) -> bool:
    if barcode.startswith("0"):
Exemple #11
0
            return None
        pattern = r"(?:[^0-9]|^)({})(?:[^0-9]|$)".format(city.postal_code)

        sub_start = max(0, city_start - self.postal_code_search_distance)
        sub_end = min(len(text), city_end + self.postal_code_search_distance)
        sub_text = text[sub_start:sub_end]

        match = re.search(pattern, sub_text)
        if match is None:
            return None
        else:
            return match.group(
                1), sub_start + match.start(1), sub_start + match.end(1)


# Store whose fetch function builds an AddressExtractor from load_cities_fr().
# The lambda defers that construction until the store calls it.
# NOTE(review): expiration_interval=None presumably means the cached
# extractor never expires -- confirm against CachedStore.
ADDRESS_EXTRACTOR_STORE = CachedStore(
    lambda: AddressExtractor(load_cities_fr()), expiration_interval=None)


def find_locations(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract location insights from the given content.

    The work is delegated to the :class:`.AddressExtractor` held in
    ``ADDRESS_EXTRACTOR_STORE``.

    Args:
        content (OCRResult or str): The content to search for locations.

    Returns:
        list of RawInsight: The result of
        :meth:`.AddressExtractor.extract_addresses` for ``content``.
    """
    return ADDRESS_EXTRACTOR_STORE.get().extract_addresses(content)
Exemple #12
0
        logger.info("Loading product store")
        ds = ProductDataset(path)
        stream = ds.stream()

        seen = set()
        for product in stream.iter_product():
            if product.barcode:
                seen.add(product.barcode)
                self.store[product.barcode] = product

        if reset:
            for key in set(self.store.keys()).difference(seen):
                self.store.pop(key)

        logger.info("product store loaded ({} items added)".format(len(seen)))

    @classmethod
    def load_from_min_dataset(cls):
        """Alternate constructor: build a store loaded from the min dataset.

        A new instance of ``cls`` is created (so subclasses get an instance
        of their own type rather than the base class) and filled from
        ``settings.JSONL_MIN_DATASET_PATH``.

        Returns:
            The newly created and loaded store.
        """
        product_store = cls()
        # NOTE(review): the second positional argument is presumably the
        # ``reset`` flag of ``load`` -- confirm against its signature.
        product_store.load(settings.JSONL_MIN_DATASET_PATH, False)
        return product_store

    def __getitem__(self, item) -> Optional[Product]:
        """Return the entry stored under ``item``, or ``None`` when absent.

        Uses ``self.store.get``, so a missing key does not raise ``KeyError``.
        """
        return self.store.get(item)

    def __iter__(self) -> Iterable[Product]:
        """Iterate over the values held in ``self.store`` (not its keys)."""
        return iter(self.store.values())


# Store whose fetch function builds a ProductStore from the min dataset; the
# lambda looks up ProductStore.load_from_min_dataset each time it is called.
CACHED_PRODUCT_STORE = CachedStore(lambda: ProductStore.load_from_min_dataset())
Exemple #13
0
def get_deaccent_cache(voc_cache: CachedStore) -> CachedStore:
    """Wrap the deaccenting function of a cached vocabulary in a new store.

    The vocabulary is read from ``voc_cache`` right away; the callable
    returned by ``Vocabulary.deaccent_tokens_fn`` for it becomes the fetch
    function of the returned ``CachedStore``.
    """
    vocabulary = voc_cache.get()
    fetch_deaccented = Vocabulary.deaccent_tokens_fn(vocabulary)
    return CachedStore(fetch_deaccented)
Exemple #14
0
def get_voc_cache(path: Path) -> CachedStore:
    """Return a store whose fetch function is ``Vocabulary.load_vocabulary_fn(path)``."""
    fetch_vocabulary = Vocabulary.load_vocabulary_fn(path)
    return CachedStore(fetch_vocabulary)
Exemple #15
0
            if added >= k:
                break

        return predictions


class FastTextLanguageIdentifier(LanguageIdentifier):
    """Language identifier that delegates to a wrapped model's ``predict``."""

    def __init__(self, model):
        # The model must expose ``predict(text, k=..., threshold=...)``
        # returning a pair ``(labels, confidences)``.
        self.model = model

    def predict(self,
                text: str,
                k: int = 10,
                threshold: float = 0.0) -> List[LanguagePrediction]:
        """Predict the languages of ``text``.

        Args:
            text: the text to analyse.
            k: forwarded to the model as ``k``.
            threshold: forwarded to the model as ``threshold``.

        Returns:
            One ``LanguagePrediction`` per (label, confidence) pair returned
            by the model, in the model's order.
        """
        labels, scores = self.model.predict(text, k=k, threshold=threshold)

        # Labels are formatted as "__label__en": drop the fixed-size prefix
        # to keep only the language code.
        prefix_length = len("__label__")

        return [
            LanguagePrediction(label[prefix_length:], score)
            for label, score in zip(labels, scores)
        ]


# Store built around LangidLanguageIdentifier.load, using CachedStore's
# default expiration settings (none are passed here).
DEFAULT_LANGUAGE_IDENTIFIER = CachedStore(LangidLanguageIdentifier.load)
Exemple #16
0
}

# A logo label is a pair of a string and an optional string.
# NOTE(review): presumably (type, value), matching the (item.type, item.value)
# keys built in get_logo_confidence_thresholds -- confirm.
LogoLabelType = Tuple[str, Optional[str]]
# Label constant with "UNKNOWN" as first element and no second element.
UNKNOWN_LABEL: LogoLabelType = ("UNKNOWN", None)


def get_logo_confidence_thresholds() -> Dict[LogoLabelType, float]:
    """Load every logo confidence threshold into a dictionary.

    Returns:
        A mapping from ``(type, value)`` to ``threshold``, built from all
        rows returned by ``LogoConfidenceThreshold.select()``.
    """
    rows = LogoConfidenceThreshold.select().iterator()
    return {(row.type, row.value): row.threshold for row in rows}


# Store built around get_logo_confidence_thresholds.
# NOTE(review): expiration_interval=10 -- the unit is not visible here;
# confirm against CachedStore.
LOGO_CONFIDENCE_THRESHOLDS = CachedStore(get_logo_confidence_thresholds,
                                         expiration_interval=10)


def get_stored_logo_ids() -> Set[int]:
    """Fetch the set of stored logo IDs from the Robotoff ANN API.

    Returns:
        The content of the ``"stored"`` field of the JSON response as a set,
        or an empty set (after logging a warning) when the response is not
        OK.
    """
    r = http_session.get(
        "https://robotoff.openfoodfacts.org/api/v1/ann/stored", timeout=30)

    if r.ok:
        return set(r.json()["stored"])

    logger.warning(
        f"error while fetching stored logo IDs ({r.status_code}): {r.text}"
    )
    return set()