def generate_insights(
    self,
    max_errors: Optional[int] = None,
    lang: str = "fr",
    limit: Optional[int] = None,
) -> Iterable[Prediction]:
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    # Keep French products that have a non-empty French ingredient list
    product_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_text_field("lang", lang)
        .filter_nonempty_text_field("ingredients_text_fr")
        .iter()
    )
    insights_count = 0

    for product in product_iter:
        if self.is_product_valid(product, max_errors=max_errors):
            insight = self.predict_insight(product["ingredients_text_fr"])

            if insight is not None:
                insight["lang"] = lang
                yield Prediction(
                    type=PredictionType.ingredient_spellcheck,
                    data=insight,
                    barcode=product["code"],
                )
                insights_count += 1

                if limit is not None and insights_count >= limit:
                    break
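# Usage sketch for generate_insights() above. It is a method, so it needs an
# instance of its owning class; `SpellcheckGenerator` is a hypothetical
# stand-in name, not one taken from the source. Assumes the JSONL dump at
# settings.JSONL_DATASET_PATH is available locally.
def _demo_spellcheck_predictions() -> None:
    generator = SpellcheckGenerator()  # hypothetical owner class
    # Cap the run at 100 predictions via the `limit` parameter
    for prediction in generator.generate_insights(max_errors=2, limit=100):
        print(prediction.barcode)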
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    category_taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
    category_classifier = CategoryClassifier(category_taxonomy)
    dataset: ProductDataset = ProductDataset.load()
    train_df, test_df = category_classifier.train(dataset)
    category_classifier.save(str(model_output_dir))
    test_metrics = category_classifier.evaluate(test_df)
    # The dump's modification time identifies the dataset version
    dataset_timestamp = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_DATASET_PATH)
    )
    meta = {
        "metrics": {"test": test_metrics},
        "dataset_id": dataset_timestamp.date().isoformat(),
        "training_set_count": len(train_df),
        "test_set_count": len(test_df),
    }

    if comment:
        meta["comment"] = comment

    with open(str(model_output_dir / "meta.json"), "w") as f:
        json.dump(meta, f)
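# Usage sketch for train() above: fit the category classifier and write the
# model plus meta.json to a local directory. The output path and comment are
# illustrative choices; ProductDataset.load() requires the JSONL dump.
import pathlib

def _demo_train_category_model() -> None:
    output_dir = pathlib.Path("models/category-clf")  # hypothetical path
    output_dir.mkdir(parents=True, exist_ok=True)
    train(output_dir, comment="baseline run")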
def main(lang: str = "fr"):
    dataset = ProductDataset.load()
    training_stream = (
        dataset.stream()
        .filter_text_field("lang", lang)
        .filter_nonempty_text_field("product_name_{}".format(lang))
    )
    generate_category_insights(training_stream.iter(), batch_size=1024)
def generate_prediction_df(self, dataset: ProductDataset) -> pd.DataFrame:
    dataset_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("product_name")
    )
    return pd.DataFrame(self.transform_product(p) for p in dataset_iter)
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    category_taxonomy: Taxonomy = TAXONOMY_STORES[TaxonomyType.category.name].get()
    category_classifier = CategoryClassifier(category_taxonomy)
    dataset: ProductDataset = ProductDataset.load()
    train_df, test_df = category_classifier.train(dataset)
    category_classifier.save(str(model_output_dir))
    test_metrics = category_classifier.evaluate(test_df)
    dataset_timestamp = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_DATASET_PATH)
    )
    meta = {
        'metrics': {'test': test_metrics},
        'dataset_id': dataset_timestamp.date().isoformat(),
        'training_set_count': len(train_df),
        'test_set_count': len(test_df),
    }

    if comment:
        meta['comment'] = comment

    with open(str(model_output_dir / 'meta.json'), 'w') as f:
        json.dump(meta, f)
def predict_category(output: str):
    from robotoff import settings
    from robotoff.elasticsearch.category.predict import predict_from_dataset
    from robotoff.products import ProductDataset
    from robotoff.utils import dump_jsonl

    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    dump_jsonl(output, predict_from_dataset(dataset))
def compute_brand_prefix(
    product_dataset: ProductDataset, threshold: Optional[int] = None
) -> Dict[Tuple[str, str], int]:
    count: Dict[Tuple[str, str], int] = {}

    for product in (
        product_dataset.stream()
        .filter_nonempty_tag_field("brands_tags")
        .filter_nonempty_text_field("code")
    ):
        brand_tags = set(x for x in product["brands_tags"] if x)
        barcode = product["code"]

        # Only standard EAN-13 barcodes carry a usable prefix
        if len(barcode) == 13:
            barcode_prefix = generate_barcode_prefix(barcode)

            for brand_tag in brand_tags:
                key = (brand_tag, barcode_prefix)
                count.setdefault(key, 0)
                count[key] += 1

    if threshold:
        # Drop (brand, prefix) pairs seen fewer than `threshold` times
        for key in list(count.keys()):
            if count[key] < threshold:
                count.pop(key)

    return count
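# Usage sketch for compute_brand_prefix() above: count (brand, barcode-prefix)
# pairs across the dump and print the ten most frequent ones. The threshold
# of 5 is an illustrative choice, not one taken from the source.
from robotoff import settings
from robotoff.products import ProductDataset

def _demo_brand_prefixes() -> None:
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    counts = compute_brand_prefix(dataset, threshold=5)
    # Sort by descending count and keep the top 10
    for (brand_tag, prefix), n in sorted(counts.items(), key=lambda kv: -kv[1])[:10]:
        print(brand_tag, prefix, n)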
def predict_from_dataset(
    dataset: ProductDataset, from_datetime: Optional[datetime.datetime] = None
) -> Iterable[JSONType]:
    """Return an iterable of category insights, using the provided dataset.

    Args:
        dataset: a ProductDataset
        from_datetime: only keep products modified after this datetime
    """
    product_stream = (
        dataset.stream()
        .filter_nonempty_text_field("code")
        .filter_nonempty_text_field("product_name")
        .filter_empty_tag_field("categories_tags")
        .filter_nonempty_tag_field("countries_tags")
        .filter_nonempty_tag_field("languages_codes")
    )

    if from_datetime:
        product_stream = product_stream.filter_by_modified_datetime(
            from_t=from_datetime
        )

    product_iter = product_stream.iter()
    logger.info("Performing prediction on products without categories")
    es_client = get_es_client()
    yield from predict_from_iterable(es_client, product_iter)
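# Usage sketch for predict_from_dataset() above: restrict prediction to
# products modified in the last 24 hours, mirroring the datetime-threshold
# pattern used by the scheduled jobs further down in this section.
import datetime
from robotoff import settings
from robotoff.products import ProductDataset

def _demo_recent_category_predictions() -> None:
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    since = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    for insight in predict_from_dataset(dataset, from_datetime=since):
        print(insight)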
def save_brand_prefix(count_threshold: int):
    product_dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    counts = compute_brand_prefix(product_dataset, threshold=count_threshold)
    brand_prefixes = list(counts.keys())

    with settings.BRAND_PREFIX_PATH.open("w") as f:
        json.dump(brand_prefixes, f)
def main(lang: str = 'fr'):
    dataset = ProductDataset.load()
    training_stream = (
        dataset.stream()
        .filter_text_field('lang', lang)
        .filter_nonempty_text_field('product_name_{}'.format(lang))
    )
    updated_product_add_category_insight(training_stream.iter(), batch_size=1024)
def predict_category(output: str):
    from robotoff import settings
    from robotoff.elasticsearch.category.predict import predict_from_dataset
    from robotoff.products import ProductDataset
    from robotoff.utils import dump_jsonl

    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    insights = predict_from_dataset(dataset)
    dict_insights = (i.to_dict() for i in insights)
    dump_jsonl(output, dict_insights)
def product_export():
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_iter = (
        dataset.stream()
        .filter_by_country_tag('en:france')
        .filter_nonempty_text_field('ingredients_text_fr')
        .filter_by_state_tag('en:complete')
        .iter()
    )
    # Skip products flagged as containing unknown ingredients
    product_iter = (
        p
        for p in product_iter
        if 'ingredients-unknown-score-above-0' not in p.get('quality_tags', [])
    )
    data = (
        (
            product['code'],
            {
                'ingredients_text_fr': normalize_ingredient_list(
                    product['ingredients_text_fr']
                )
            },
        )
        for product in product_iter
    )
    logger.info("Importing products")
    es_client = get_es_client()
    perform_export(es_client, data, settings.ELASTICSEARCH_PRODUCT_INDEX)
def generate_product_data() -> Iterable[Tuple[str, Dict]]:
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_stream = (
        dataset.stream()
        .filter_text_field("lang", "fr")
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
    )
    product_iter = product_stream.iter()
    # Keep only products where every ingredient was recognized
    product_iter = (
        p for p in product_iter if int(p.get("unknown_ingredients_n", 0)) == 0
    )
    return (
        (
            product["code"],
            {
                "ingredients_text_fr": normalize_ingredient_list(
                    product["ingredients_text_fr"]
                )
            },
        )
        for product in product_iter
    )
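# Usage sketch: generate_product_data() above yields (barcode, document) pairs
# shaped for the Elasticsearch export used by product_export(); here we just
# peek at the first few pairs without materializing the whole stream.
import itertools

def _demo_peek_product_data() -> None:
    for barcode, doc in itertools.islice(generate_product_data(), 3):
        print(barcode, doc["ingredients_text_fr"][:60])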
def generate_insights(client, confidence=1):
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .iter()
    )

    for product in product_iter:
        text = product["ingredients_text_fr"]
        corrections = generate_corrections(client, text, confidence=confidence)

        if not corrections:
            continue

        # Flatten the per-correction term lists into a single list
        term_corrections = list(
            itertools.chain.from_iterable(c.term_corrections for c in corrections)
        )
        yield {
            "corrections": [dataclasses.asdict(c) for c in term_corrections],
            "text": text,
            "corrected": generate_corrected_text(term_corrections, text),
            "barcode": product["code"],
        }
def generate_insights(): """Generate and import category insights from the latest dataset dump, for products added at day-1.""" logger.info("Generating new category insights") product_store: ProductStore = CACHED_PRODUCT_STORE.get() importer = CategoryImporter(product_store) datetime_threshold = datetime.datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0) - datetime.timedelta(days=1) dataset = ProductDataset(settings.JSONL_DATASET_PATH) category_insights_iter = predict_from_dataset(dataset, datetime_threshold) imported = importer.import_insights(category_insights_iter) logger.info("{} category insights imported".format(imported))
def images_dimension_iter():
    dataset = ProductDataset.load()

    for product in dataset.stream().filter_nonempty_text_field("code"):
        images = product.get("images", {})

        for image_id, image_data in images.items():
            # Skip selected-image keys (e.g. "front_fr"); keep raw numeric ids
            if not image_id.isdigit():
                continue

            if "full" not in image_data["sizes"]:
                continue

            width = image_data["sizes"]["full"]["w"]
            height = image_data["sizes"]["full"]["h"]
            yield [int(width), int(height), product["code"], str(image_id)]
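# Usage sketch: write the rows yielded by images_dimension_iter() above to a
# CSV file. The output filename is an illustrative choice.
import csv

def _demo_dump_image_dimensions() -> None:
    with open("image_dimensions.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["width", "height", "barcode", "image_id"])
        writer.writerows(images_dimension_iter())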
def product_export():
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
        .iter()
    )
    # Skip products flagged as containing unknown ingredients
    product_iter = (
        p
        for p in product_iter
        if "ingredients-unknown-score-above-0" not in p.get("quality_tags", [])
    )
    data = (
        (
            product["code"],
            {
                "ingredients_text_fr": normalize_ingredient_list(
                    product["ingredients_text_fr"]
                )
            },
        )
        for product in product_iter
    )
    logger.info("Importing products")
    es_client = get_es_client()
    inserted = perform_export(es_client, data, settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("{} rows inserted".format(inserted))
def update_recycling(username: str, password: str) -> None:
    """Update the "Recycle" image selection for products whose text matches
    language-specific trigger terms."""
    recycling_triggers = {
        "en": ["throw away", "recycle"],
        "fr": ["consignesdetri.fr", "recycler", "jeter", "bouteille"],
    }
    # Get the products dataset
    dataset = ProductDataset.load()

    # Iterate over products
    for product in dataset.stream().filter_nonempty_text_field("code"):
        if "packaging-photo-to-be-selected" not in product.get("states", ""):
            continue

        product_code = product.get("code")
        if not product_code:
            continue

        images = get_images(product_code)
        if not images:
            continue

        product_images_items = images.get("product", {}).get("images", {}).items()
        # Raw images: entries without an "imgid" back-reference
        images_ids = {i for i, j in product_images_items if not j.get("imgid")}
        # Images already selected as packaging photos
        pack_images = {i: j for i, j in product_images_items if "packaging" in i}

        for image_id in images_ids:
            for lang in recycling_triggers.keys():
                field = "packaging_{}".format(lang)

                if check_image_in_pack(image_id, field, pack_images):
                    continue

                if not check_trigger_in_text(
                    product_code, image_id, recycling_triggers[lang]
                ):
                    continue

                select_image(
                    product_code, image_id, field, pack_images, username, password
                )
def generate_insights(): """Generate and import category insights from the latest dataset dump, for products added at day-1.""" logger.info("Generating new category insights") datetime_threshold = datetime.datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0) - datetime.timedelta(days=1) dataset = ProductDataset(settings.JSONL_DATASET_PATH) product_predictions_iter = predict_from_dataset(dataset, datetime_threshold) imported = import_insights( product_predictions_iter, server_domain=settings.OFF_SERVER_DOMAIN, automatic=False, ) logger.info("{} category insights imported".format(imported))
def run(lang: Optional[str] = None):
    dataset = ProductDataset.load()
    training_stream = dataset.stream().filter_nonempty_tag_field('categories_tags')

    if lang is not None:
        training_stream = training_stream.filter_text_field(
            'lang', lang
        ).filter_nonempty_text_field('product_name_{}'.format(lang))
    else:
        training_stream = training_stream.filter_nonempty_text_field('product_name')

    dataset_iter = generate_dataset(training_stream, lang)
    count = dump_jsonl(
        settings.PROJECT_DIR
        / 'datasets'
        / 'category'
        / 'category_{}.jsonl'.format(lang or 'xx'),
        dataset_iter,
    )
    print(count)
def run(lang: Optional[str] = None):
    logger.info("Generating category dataset for lang {}".format(lang or "xx"))
    dataset = ProductDataset.load()
    training_stream = dataset.stream().filter_nonempty_tag_field("categories_tags")

    if lang is not None:
        training_stream = training_stream.filter_text_field(
            "lang", lang
        ).filter_nonempty_text_field("product_name_{}".format(lang))
    else:
        training_stream = training_stream.filter_nonempty_text_field("product_name")

    dataset_iter = generate_dataset(training_stream, lang)
    count = dump_jsonl(
        settings.PROJECT_DIR
        / "datasets"
        / "category"
        / "category_{}.jsonl".format(lang or "xx"),
        dataset_iter,
    )
    logger.info("{} items for lang {}".format(count, lang or "xx"))
def generate_training_df(self, dataset: ProductDataset) -> pd.DataFrame:
    training_dataset_iter = (
        dataset.stream()
        .filter_by_country_tag('en:france')
        .filter_nonempty_text_field('product_name')
        .filter_nonempty_tag_field('categories_tags')
    )
    training_dataset = []
    processed = 0

    for product in training_dataset_iter:
        processed += 1
        transformed_product = self.transform_product(product, add_category=True)

        # Products whose categories could not be mapped to the taxonomy
        # lack a 'deepest_category' key and are discarded
        if 'deepest_category' in transformed_product:
            training_dataset.append(transformed_product)

    logger.info(
        "{} training samples discarded (category not in taxonomy), "
        "{} remaining".format(processed - len(training_dataset), len(training_dataset))
    )
    return pd.DataFrame(training_dataset)
import pathlib
from typing import Set

import requests

from robotoff import settings
from robotoff.off import generate_image_url
from robotoff.products import ProductDataset
from robotoff.utils import get_logger
from robotoff.utils.types import JSONType

logger = get_logger()

JSONL_SHUF_DATASET_PATH = settings.DATASET_DIR / 'products-shuf.jsonl.gz'
ds = ProductDataset(JSONL_SHUF_DATASET_PATH)

IMAGE_DATASET_DIR = settings.PROJECT_DIR / 'image_dataset'
NUTRITION_TABLE_IMAGE_DIR = IMAGE_DATASET_DIR / 'nutrition-table-2'


def load_seen_set() -> Set[str]:
    seen_set = set()

    with open(IMAGE_DATASET_DIR / 'dataset.txt') as f:
        for line in f:
            if line:
                line = line.strip('\n')
                barcode, _ = line.split('_')
                seen_set.add(barcode)

    return seen_set
import csv

from robotoff import settings
from robotoff.insights.ocr.core import get_source
from robotoff.products import ProductDataset

ds = ProductDataset.load()
product_iter = (
    ds.stream()
    .filter_by_country_tag('en:france')
    .filter_nonempty_text_field('ingredients_text_fr')
    .filter_number_field('unknown_ingredients_n', 2, 0, 'geq')
    .iter()
)

with open('spellcheck_test_fr.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',', dialect='unix')

    for product in product_iter:
        if 'images' not in product:
            continue

        images = product['images']

        if 'ingredients_fr' not in images:
            continue

        print(product['unknown_ingredients_n'])
        barcode = product['code']
        url = 'https://world.openfoodfacts.org/product/{}'.format(barcode)
        rev_id = images['ingredients_fr']['rev']
        image_name = "ingredients_fr.{}.400".format(rev_id)