def generate_dataset(stream: ProductStream,
                     lang: Optional[str]) -> Iterator[JSONType]:
    """Yield one dataset record per product that has an inferred category.

    Products for which ``infer_category_tags`` produces nothing are skipped.

    :param stream: product stream, consumed through ``stream.iter()``
    :param lang: optional language code; when set, the product name and the
        ingredient text are read from the ``product_name_<lang>`` and
        ``ingredients_text_<lang>`` fields, otherwise from the unsuffixed
        fields
    :return: an iterator of dicts with the keys ``code``, ``product_name``,
        ``categories_tags``, ``ingredient_tags``, ``known_ingredient_tags``,
        ``ingredients_text`` and ``lang``

    ``categories_tags``, ``code`` and the product name field are read with
    subscript access, so every streamed product is expected to carry them.
    """
    category_taxonomy = get_taxonomy('category')
    ingredient_taxonomy = get_taxonomy('ingredient')

    # Both field names depend only on `lang`: resolve them once instead of
    # re-formatting the same two strings for every product.
    if lang:
        ingredients_text_field = 'ingredients_text_{}'.format(lang)
        product_name_field = 'product_name_{}'.format(lang)
    else:
        ingredients_text_field = 'ingredients_text'
        product_name_field = 'product_name'

    for product in stream.iter():
        categories_tags: List[str] = product['categories_tags']
        inferred_categories_tags: List[TaxonomyNode] = list(
            infer_category_tags(categories_tags, category_taxonomy))

        if not inferred_categories_tags:
            # No usable category for this product: nothing to emit.
            continue

        # Drop empty/falsy tags before checking them against the taxonomy.
        ingredient_tags = [
            tag for tag in product.get('ingredients_tags', []) if tag
        ]
        known_ingredient_tags = [
            tag for tag in ingredient_tags if tag in ingredient_taxonomy
        ]

        # A missing or empty ingredient text is normalised to None.
        ingredients_text = product.get(ingredients_text_field, None) or None

        yield {
            'code': product['code'],
            'product_name': product[product_name_field],
            'categories_tags': [x.id for x in inferred_categories_tags],
            'ingredient_tags': ingredient_tags,
            'known_ingredient_tags': known_ingredient_tags,
            'ingredients_text': ingredients_text,
            'lang': product.get('lang', None),
        }
def generate_dataset(stream: ProductStream,
                     lang: Optional[str]) -> Iterator[JSONType]:
    """Yield one dataset record per product that has an inferred category.

    Products for which ``infer_category_tags`` produces nothing are skipped.

    :param stream: product stream, consumed through ``stream.iter()``
    :param lang: optional language code; when set, the product name and the
        ingredient text are read from the ``product_name_<lang>`` and
        ``ingredients_text_<lang>`` fields, otherwise from the unsuffixed
        fields
    :return: an iterator of dicts with the keys ``code``, ``nutriments``,
        ``images``, ``product_name``, ``categories_tags``,
        ``ingredient_tags``, ``known_ingredient_tags``, ``ingredients_text``
        and ``lang``

    ``categories_tags``, ``code`` and the product name field are read with
    subscript access, so every streamed product is expected to carry them.
    """
    category_taxonomy = get_taxonomy("category")
    ingredient_taxonomy = get_taxonomy("ingredient")

    # Both field names depend only on `lang`: resolve them once instead of
    # re-formatting the same two strings for every product.
    if lang:
        ingredients_text_field = "ingredients_text_{}".format(lang)
        product_name_field = "product_name_{}".format(lang)
    else:
        ingredients_text_field = "ingredients_text"
        product_name_field = "product_name"

    for product in stream.iter():
        categories_tags: List[str] = product["categories_tags"]
        inferred_categories_tags: List[TaxonomyNode] = list(
            infer_category_tags(categories_tags, category_taxonomy))

        if not inferred_categories_tags:
            # No usable category for this product: nothing to emit.
            continue

        # Drop empty/falsy tags before checking them against the taxonomy.
        ingredient_tags = [
            tag for tag in product.get("ingredients_tags", []) if tag
        ]
        known_ingredient_tags = [
            tag for tag in ingredient_tags if tag in ingredient_taxonomy
        ]

        # A missing or empty ingredient text is normalised to None.
        ingredients_text = product.get(ingredients_text_field, None) or None

        yield {
            "code": product["code"],
            # Empty/missing nutriments and images are both emitted as None.
            "nutriments": product.get("nutriments") or None,
            "images": product.get("images", {}) or None,
            "product_name": product[product_name_field],
            "categories_tags": [x.id for x in inferred_categories_tags],
            "ingredient_tags": ingredient_tags,
            "known_ingredient_tags": known_ingredient_tags,
            "ingredients_text": ingredients_text,
            "lang": product.get("lang", None),
        }
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    """Train the category classifier, save it and write a ``meta.json``.

    :param model_output_dir: directory receiving the saved model and the
        ``meta.json`` file
    :param comment: optional free-text note stored under the ``comment`` key
        of the metadata; omitted when empty
    """
    taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
    classifier = CategoryClassifier(taxonomy)

    product_dataset: ProductDataset = ProductDataset.load()
    train_df, test_df = classifier.train(product_dataset)
    classifier.save(str(model_output_dir))

    metrics_on_test = classifier.evaluate(test_df)

    # The dataset is identified by the day its JSONL file was last modified.
    dataset_mtime = os.path.getmtime(settings.JSONL_DATASET_PATH)
    dataset_day = datetime.datetime.fromtimestamp(dataset_mtime).date()

    meta = {
        'metrics': {'test': metrics_on_test},
        'dataset_id': dataset_day.isoformat(),
        'training_set_count': len(train_df),
        'test_set_count': len(test_df),
    }
    if comment:
        meta['comment'] = comment

    meta_path = model_output_dir / 'meta.json'
    with open(str(meta_path), 'w') as f:
        json.dump(meta, f)
def format_question(self, insight: ProductInsight, lang: str) -> Question:
    """Build the localized ``AddBinaryQuestion`` for a label insight.

    :param insight: insight whose ``value_tag`` is the label to ask about
    :param lang: language used for both the question text and the label name
    :return: the question, with an optional label image and an optional
        source image URL
    """
    tag: str = insight.value_tag

    # Only labels registered in LABEL_IMAGES get an illustrative image.
    label_image_url = LABEL_IMAGES[tag] if tag in LABEL_IMAGES else None

    label_taxonomy: Taxonomy = get_taxonomy(TaxonomyType.label.name)
    localized_label: str = label_taxonomy.get_localized_name(tag, lang)
    question_text = self.translation_store.gettext(lang, self.question)

    if insight.source_image:
        source_url = settings.OFF_IMAGE_BASE_URL + get_display_image(
            insight.source_image)
    else:
        source_url = None

    return AddBinaryQuestion(
        question=question_text,
        value=localized_label,
        value_tag=tag,
        insight=insight,
        image_url=label_image_url,
        source_image_url=source_url,
    )
def is_valid(self, barcode: str, label_tag: str,
             label_seen: Set[str]) -> bool:
    """Tell whether a predicted label adds information for a product.

    :param barcode: barcode used to look the product up in the product store
    :param label_tag: predicted label
    :param label_seen: labels already predicted for this product
    :return: False when the label is already on the product, was already
        seen, or is a parent of one of those labels; True otherwise
    """
    product = self.product_store[barcode]
    existing_labels = getattr(product, 'labels_tags', [])

    if label_tag in existing_labels or label_tag in label_seen:
        return False

    taxonomy: Taxonomy = get_taxonomy(InsightType.label.name)
    if label_tag not in taxonomy:
        # Labels unknown to the taxonomy cannot be compared hierarchically.
        return True

    # Reject the prediction if it is a parent of a current/already
    # predicted label.
    predicted_node: TaxonomyNode = taxonomy[label_tag]
    for known_label in set(existing_labels).union(label_seen):
        known_node = taxonomy[known_label]
        if known_node is not None and known_node.is_child_of(predicted_node):
            return False

    return True
def is_latent(product: Optional[Product], barcode: str, tag: str,
              seen_set: Set[str]) -> bool:
    """Tell whether a predicted label is already covered for a product.

    :param product: the product, or None (treated as having no labels)
    :param barcode: product barcode (not used by this check)
    :param tag: predicted label
    :param seen_set: labels already predicted for this product
    :return: True when the label is already on the product, was already
        seen, or is a parent of one of those labels; False otherwise
    """
    current_labels = getattr(product, "labels_tags", [])
    if tag in current_labels or tag in seen_set:
        return True

    taxonomy: Taxonomy = get_taxonomy(InsightType.label.name)
    if tag not in taxonomy:
        return False

    # The prediction is latent if it is a parent of a current/already
    # predicted label.
    tag_node: TaxonomyNode = taxonomy[tag]
    known_nodes = (
        taxonomy[label] for label in set(current_labels).union(seen_set)
    )
    return any(
        node is not None and node.is_child_of(tag_node)
        for node in known_nodes
    )
def categorize(
    barcode: str,
    deepest_only: bool = False,
) -> None:
    """Categorise predicts product categories based on the neural category
    classifier.

    deepest_only: controls whether the returned predictions should only
    contain the deepmost categories for a predicted taxonomy chain.

    For example, if we predict 'fresh vegetables' -> 'legumes' -> 'beans'
    for a product, setting deepest_only=True will return 'beans'."""
    # Imports are kept local to the function, as in the original.
    from robotoff.prediction.category.neural.category_classifier import (
        CategoryClassifier,
    )
    from robotoff.products import get_product
    from robotoff.taxonomy import TaxonomyType, get_taxonomy

    product = get_product(barcode)
    if product is None:
        print(f"Product {barcode} not found")
        return

    category_taxonomy = get_taxonomy(TaxonomyType.category.name)
    classifier = CategoryClassifier(category_taxonomy)
    predictions = classifier.predict(product, deepest_only)

    if not predictions:
        print(f"Nothing predicted for product {barcode}")
        return

    # One "<category tag>: <confidence>" line per prediction.
    for prediction in predictions:
        print(f"{prediction.value_tag}: {prediction.data['confidence']}")
def extract_ingredients_from_taxonomy(lang: str):
    """Collect the lower-cased ingredient synonyms available in ``lang``.

    :param lang: language passed to each node's ``get_synonyms``
    :return: a set with every synonym of every ingredient taxonomy node,
        lower-cased
    """
    ingredient_taxonomy = get_taxonomy(TaxonomyType.ingredient.name)
    return {
        synonym.lower()
        for node in ingredient_taxonomy.nodes.values()
        for synonym in node.get_synonyms(lang)
    }
def format_question(self, insight: ProductInsight, lang: str) -> Question:
    """Build the localized ``AddBinaryQuestion`` for a category insight.

    :param insight: insight whose ``value_tag`` is the category to ask about
    :param lang: language used for both the question text and the category
        name
    :return: the question, including the source image URL resolved from the
        insight barcode
    """
    category_tag: str = insight.value_tag
    category_taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
    localized_category: str = category_taxonomy.get_localized_name(
        category_tag, lang)
    question_text = self.translation_store.gettext(lang, self.question)

    return AddBinaryQuestion(
        question=question_text,
        value=localized_category,
        insight=insight,
        source_image_url=self.get_source_image_url(insight.barcode),
    )
def category_export():
    """Replace the categories stored in Elasticsearch with the taxonomy.

    Existing categories are deleted through ``delete_categories`` first; the
    data generated from the category taxonomy is then exported into the
    ``settings.ELASTICSEARCH_CATEGORY_INDEX`` index and the number of
    inserted rows is logged.
    """
    logger.info("Starting category export to Elasticsearch...")
    client = get_es_client()
    category_taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)

    logger.info("Deleting existing categories...")
    delete_categories(client)

    logger.info("Starting export...")
    category_data = generate_category_data(category_taxonomy)
    rows_inserted = perform_export(client, category_data,
                                   settings.ELASTICSEARCH_CATEGORY_INDEX)

    # Pass the value as a logging argument so interpolation is deferred to
    # the logging framework instead of being done eagerly with `%`.
    logger.info("%d rows inserted", rows_inserted)
def generate_candidates(
    cls,
    product: Product,
    predictions: List[Prediction],
) -> Iterator[ProductInsight]:
    """Turn valid category predictions into insight candidates.

    :param product: product the predictions were made for
    :param predictions: predictions to filter
    :return: an iterator of ``ProductInsight`` built from the predictions
        that pass ``cls.is_prediction_valid`` and are kept by
        ``select_deepest_taxonomized_candidates``
    """
    valid_predictions = []
    for prediction in predictions:
        if cls.is_prediction_valid(
                product, prediction.value_tag):  # type: ignore
            valid_predictions.append(prediction)

    category_taxonomy = get_taxonomy(InsightType.category.name)
    for selected in select_deepest_taxonomized_candidates(
            valid_predictions, category_taxonomy):
        yield ProductInsight(**selected.to_dict())
def on_get(self, req: falcon.Request, resp: falcon.Response):
    """Answer with the category predictions for the requested barcode.

    Reads the required ``barcode`` query parameter and the optional boolean
    ``deepest_only`` one (default False). The response body is
    ``{"categories": [...]}``; the list is empty when the product lookup
    returns nothing.
    """
    barcode = req.get_param("barcode", required=True)
    deepest_only = req.get_param_as_bool("deepest_only", default=False)

    product = get_product(barcode)
    if not product:
        resp.media = {"categories": []}
        return

    classifier = CategoryClassifier(
        get_taxonomy(TaxonomyType.category.name))
    predictions = classifier.predict(product, deepest_only)
    resp.media = {
        "categories": [prediction.to_dict() for prediction in predictions]
    }
def generate_category_data() -> Iterable[Tuple[str, Dict]]:
    """Yield one ``(document id, document)`` pair per category node.

    The document holds a ``"<lang>:name"`` entry for every language of the
    node that is in ``SUPPORTED_LANG``, plus the node id under ``"id"``. The
    document id is the SHA-256 hex digest of the UTF-8 encoded node id.
    """
    taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)

    for node in taxonomy.iter_nodes():
        document = {}
        for lang in node.names:
            if lang in SUPPORTED_LANG:
                document["{}:name".format(lang)] = node.names[lang]
        document["id"] = node.id

        document_id = hashlib.sha256(node.id.encode("utf-8")).hexdigest()
        yield document_id, document
def generate_candidates(
    cls,
    product: Product,
    predictions: List[Prediction],
) -> Iterator[ProductInsight]:
    """Turn valid label predictions into insight candidates.

    :param product: product the predictions were made for
    :param predictions: predictions to filter
    :return: an iterator of ``ProductInsight`` built from the predictions
        that pass ``cls.is_prediction_valid`` and are kept by
        ``select_deepest_taxonomized_candidates``
    """
    valid_predictions = [
        item for item in predictions
        if cls.is_prediction_valid(product, item.value_tag)  # type: ignore
    ]
    label_taxonomy = get_taxonomy(InsightType.label.name)

    for selected in select_deepest_taxonomized_candidates(
            valid_predictions, label_taxonomy):
        insight = ProductInsight(**selected.to_dict())
        if insight.automatic_processing is None:
            # No explicit decision yet: process automatically only when the
            # label is in the authorized labels store.
            authorized_labels = AUTHORIZED_LABELS_STORE.get()
            insight.automatic_processing = (
                selected.value_tag in authorized_labels)
        yield insight
def add_category_insight(barcode: str, product: JSONType,
                         server_domain: str) -> bool:
    """Predict categories for product and import predicted category insight.

    Predictions come from two sources: the Elasticsearch-based predictor and
    the neural category classifier. An HTTP error from the neural classifier
    is logged and that source is then simply skipped.

    :param barcode: product barcode
    :param product: product as retrieved from application
    :param server_domain: the server the product belongs to
    :return: True if at least one category insight was imported; always
        False when the server type is not ``ServerType.off`` or when nothing
        was predicted
    """
    if get_server_type(server_domain) != ServerType.off:
        return False

    logger.info("Predicting product categories...")

    # predict category using Elasticsearch on title
    product_predictions = []
    es_prediction = predict_category_from_product_es(product)
    if es_prediction is not None:
        product_predictions.append(es_prediction)

    # predict category using neural model
    neural_predictions = []
    try:
        neural_predictions = CategoryClassifier(
            get_taxonomy(TaxonomyType.category.name)).predict(product)
    except requests.exceptions.HTTPError as e:
        resp = e.response
        # Use %-style logging arguments only: mixing an f-string with a `%s`
        # placeholder would break as soon as the interpolated part contained
        # a '%' character.
        logger.error("Category classifier returned an error: %s: %s",
                     resp.status_code, resp.text)

    for neural_prediction in neural_predictions:
        # The barcode is set here on each neural prediction before import.
        neural_prediction.barcode = barcode
        product_predictions.append(neural_prediction)

    if not product_predictions:
        return False

    imported = import_insights(product_predictions, server_domain,
                               automatic=True)
    logger.info("%s category insight imported for product %s",
                imported, barcode)
    return bool(imported)
def is_valid(self, insight: ProductInsight,
             product: Optional[Product] = None) -> bool:
    """Tell whether a category insight still adds information.

    :param insight: insight whose ``value_tag`` is the predicted category
    :param product: the product; looked up in the product store from the
        insight barcode when not provided
    :return: False when the product already has the category, or when the
        category is a parent of one of the product categories; True otherwise
    """
    if product is None:
        product = self.product_store[insight.barcode]

    existing_categories = getattr(product, "categories_tags", [])
    predicted = insight.value_tag

    if predicted in existing_categories:
        return False

    # Check that the predicted category is not a parent of a
    # current/already predicted category
    taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)
    is_known_parent = predicted in taxonomy and taxonomy.is_parent_of_any(
        predicted, existing_categories)
    return not is_known_parent
def is_valid(self, insight: ProductInsight,
             product: Optional[Product] = None) -> bool:
    """Tell whether a label insight still adds information.

    :param insight: insight whose ``value_tag`` is the predicted label
    :param product: the product; looked up in the product store from the
        insight barcode when not provided
    :return: False when the product already has the label, or when the label
        is a parent of one of the product labels; True otherwise
    """
    if product is None:
        product = self.product_store[insight.barcode]

    existing_labels = getattr(product, 'labels_tags', [])
    predicted = insight.value_tag

    if predicted in existing_labels:
        return False

    taxonomy: Taxonomy = get_taxonomy(InsightType.label.name)
    if predicted not in taxonomy:
        # Labels unknown to the taxonomy are accepted as they are.
        return True

    # Check that the predicted label is not a parent of a
    # current/already predicted label
    return not taxonomy.is_parent_of_any(predicted, existing_labels)
def is_valid(
    self,
    product: Optional[Product],
    barcode: str,
    category: str,
    seen_set: Set[str],
):
    """Tell whether a predicted category adds information for a product.

    :param product: the product, or None (treated as having no categories)
    :param barcode: product barcode (not used by this check)
    :param category: predicted category
    :param seen_set: categories already predicted for this product
    :return: False when the category is already on the product, was already
        seen, or is a parent of one of those categories; True otherwise
    """
    existing_categories = getattr(product, "categories_tags", [])

    if category in existing_categories:
        logger.debug("The product already belongs to this category, "
                     "considering the insight as invalid")
        return False

    if category in seen_set:
        logger.debug("An insight already exists for this product and "
                     "category, considering the insight as invalid")
        return False

    taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)
    if category not in taxonomy:
        # Categories unknown to the taxonomy cannot be compared
        # hierarchically.
        return True

    # Check that the predicted category is not a parent of a
    # current/already predicted category
    predicted_node: TaxonomyNode = taxonomy[category]
    for known_category in set(existing_categories).union(seen_set):
        known_node = taxonomy[known_category]
        if known_node is None or not known_node.is_child_of(predicted_node):
            continue
        logger.debug(
            "The predicted category is a parent of the product "
            "category or of the predicted category of an insight, "
            "considering the insight as invalid")
        return False

    return True
def is_parent_label(cls, tag: str, to_check_labels: Set[str]) -> bool:
    """Tell whether ``tag`` is a parent of any label in ``to_check_labels``.

    Used to check that the predicted label is not a parent of a
    current/already predicted label.

    :param tag: predicted label
    :param to_check_labels: labels to compare the prediction against
    :return: the result of ``Taxonomy.is_parent_of_any`` on the label
        taxonomy
    """
    # NOTE(review): `raises=False` presumably makes unknown tags return False
    # instead of raising -- confirm against Taxonomy.is_parent_of_any.
    label_taxonomy = get_taxonomy(InsightType.label.name)
    return label_taxonomy.is_parent_of_any(
        tag, to_check_labels, raises=False)
def is_parent_category(cls, category: str, to_check_categories: Set[str]):
    """Tell whether ``category`` is a parent of any of ``to_check_categories``.

    Used to check that the predicted category is not a parent of a
    current/already predicted category.

    :param category: predicted category
    :param to_check_categories: categories to compare the prediction against
    :return: the result of ``Taxonomy.is_parent_of_any`` on the category
        taxonomy
    """
    # NOTE(review): `raises=False` presumably makes unknown tags return False
    # instead of raising -- confirm against Taxonomy.is_parent_of_any.
    category_taxonomy = get_taxonomy(InsightType.category.name)
    return category_taxonomy.is_parent_of_any(
        category, to_check_categories, raises=False)
import csv

from robotoff.taxonomy import get_taxonomy

# Dump the ingredient, category and label taxonomies to "<name>.tsv" files
# with the columns id / name / description (the English name is written to
# both the name and the description columns).
for taxonomy_name in ("ingredient", "category", "label"):
    taxonomy = get_taxonomy(taxonomy_name)

    # Explicit UTF-8: names may contain non-ASCII characters and the default
    # encoding of open() depends on the platform locale.
    # NOTE(review): the file is named .tsv but DictWriter uses its default
    # comma delimiter -- confirm what the consumer expects before changing.
    with open(f"{taxonomy_name}.tsv", "w", newline="",
              encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["id", "name", "description"])
        writer.writeheader()

        for node in taxonomy.iter_nodes():
            name = node.get_localized_name("en")

            # Rows whose English name equals the node id are skipped
            # (presumably nodes without an English name -- TODO confirm).
            if name != node.id:
                writer.writerow(
                    {"id": node.id, "name": name, "description": name})
def test_select_deepest_taxonomized_candidates(candidates, taxonomy_name,
                                               kept_indices):
    """Check which candidates ``select_deepest_taxonomized_candidates`` keeps.

    The selection made with the ``taxonomy_name`` taxonomy must equal the
    candidates found at ``kept_indices``, in that order.
    """
    taxonomy = get_taxonomy(taxonomy_name)
    selected = select_deepest_taxonomized_candidates(candidates, taxonomy)
    expected = [candidates[idx] for idx in kept_indices]
    assert selected == expected