def process_insights(
    self,
    data: Iterable[ProductInsights],
    server_domain: str,
    automatic: bool = False,
) -> Iterator[Insight]:
    """Yield one non-latent insight per not-yet-seen (barcode, lang) pair.

    The set of known pairs is seeded from the stored ``ProductInsight`` rows
    of this importer's type and ``server_domain`` whose annotation is NULL.
    It grows as insights are yielded, so each pair is emitted at most once.

    ``automatic`` is accepted but not read by this implementation.
    """
    pending_rows = (
        ProductInsight.select(ProductInsight.barcode, ProductInsight.data)
        .where(
            ProductInsight.type == self.get_type(),
            ProductInsight.server_domain == server_domain,
            ProductInsight.annotation.is_null(True),
        )
        .iterator()
    )
    known_pairs: Set[Tuple[str, str]] = {
        (row.barcode, row.data["lang"]) for row in pending_rows
    }

    for product_insights in data:
        barcode = product_insights.barcode
        for insight in product_insights.insights:
            pair = (barcode, insight.data["lang"])
            if pair in known_pairs:
                continue
            # Register the pair before yielding so later duplicates are dropped.
            known_pairs.add(pair)
            yield Insight.from_raw_insight(insight, product_insights, latent=False)
def process_product_insights(self, barcode: str,
                             insights: List[JSONType]) -> Iterable[JSONType]:
    """Yield at most one insert payload for ``barcode``.

    Nothing is yielded when more than one candidate insight is supplied
    (the import is aborted and logged), or when an insight of this type is
    already stored for the barcode.
    """
    n_candidates = len(insights)
    if n_candidates > 1:
        logger.info("{} distinct expiration dates found for product "
                    "{}, aborting import".format(n_candidates, barcode))
        return

    stored_count = ProductInsight.select().where(
        ProductInsight.type == self.get_type(),
        ProductInsight.barcode == barcode,
    ).count()
    if stored_count:
        return

    for candidate in insights:
        content = candidate['content']

        if not self.is_valid(barcode):
            continue

        source = candidate['source']
        yield {
            'source_image': source,
            'data': {
                'source': source,
                'notify': content['notify'],
                **content,
            },
        }
        # Only the first valid candidate is emitted.
        break
def test_annotate_insight_not_enough_votes(client):
    """A single anonymous annotation is stored as a vote; the insight stays open."""
    payload = {
        "insight_id": insight_id,
        "annotation": -1,
        "device_id": "voter1",
    }
    response = client.simulate_post("/api/v1/insights/annotate", params=payload)

    assert response.status_code == 200
    assert response.json == {
        "description": "the annotation vote was saved",
        "status": "vote_saved",
    }

    # For non-authenticated users the insight must not be validated: only a
    # vote is cast, identified by the device id and without a username.
    recorded_votes = list(AnnotationVote.select().dicts())
    assert len(recorded_votes) == 1
    vote = recorded_votes[0]
    assert vote["value"] == -1
    assert vote["username"] is None
    assert vote["device_id"] == "voter1"

    insight_query = ProductInsight.select().where(ProductInsight.id == insight_id)
    insight = next(insight_query.dicts().iterator())
    # None of the completion fields may be set yet.
    for field in ("username", "completed_at", "annotation"):
        assert not insight[field]
    # Strict-superset comparison: the row must contain n_votes == 1.
    assert insight.items() > {"n_votes": 1}.items()
def batch_annotate(insight_type: str,
                   dry: bool = True,
                   json_contains: Optional[Dict] = None):
    """Positively annotate every un-annotated insight of ``insight_type``.

    When ``json_contains`` is given, only insights whose ``data`` contains it
    are selected. With ``dry`` set (the default) nothing is annotated: the
    number of matching insights and the filter are printed instead.
    """
    annotator = InsightAnnotatorFactory.get(insight_type)

    filters = [
        ProductInsight.type == insight_type,
        ProductInsight.annotation.is_null(),
    ]
    if json_contains is not None:
        filters.append(ProductInsight.data.contains(json_contains))
    query = ProductInsight.select().where(*filters)

    if dry:
        matching = query.count()
        print("-- dry run --\n"
              "{} items matching filter:\n"
              " insight type: {}\n"
              " filter: {}"
              "".format(matching, insight_type, json_contains))
        return

    for position, insight in enumerate(query, start=1):
        print("Insight %d" % position)
        print("Add label {} to https://fr.openfoodfacts.org/produit/{}"
              "".format(insight.data, insight.barcode))
        print(insight.data)
        annotator.annotate(insight, 1, update=True)
def process_insights():
    """Annotate every insight whose scheduled processing time has passed.

    Selects insights with a NULL annotation and a non-NULL ``process_after``
    that is not later than the current UTC time, and annotates each with
    value ``1``. When the annotation result is ``UPDATED_ANNOTATION_RESULT``
    and the insight data has a truthy ``notify`` entry, the Slack notifier is
    informed. A failure on one insight is logged and does not stop the loop.
    """
    processed = 0
    due_insights = ProductInsight.select().where(
        ProductInsight.annotation.is_null(),
        ProductInsight.process_after.is_null(False),
        ProductInsight.process_after <= datetime.datetime.utcnow(),
    ).iterator()

    for insight in due_insights:
        try:
            annotator = InsightAnnotatorFactory.get(insight.type)
            logger.info("Annotating insight %s (product: %s)", insight.id,
                        insight.barcode)
            annotation_result = annotator.annotate(insight, 1, update=True)
            processed += 1

            if (annotation_result == UPDATED_ANNOTATION_RESULT
                    and insight.data.get("notify", False)):
                slack.NotifierFactory.get_notifier(
                ).notify_automatic_processing(insight)
        except Exception as e:
            # Continue to the next insight.
            # Note (from the original author): the annotator already
            # rolled back the transaction.
            # The exception is passed as a lazy %-argument rather than being
            # interpolated into the format string: a '%' in its message would
            # otherwise corrupt the logging format.
            logger.exception(
                "exception %s while handling annotation of insight %s (product) %s",
                e,
                insight.id,
                insight.barcode,
            )

    logger.info("%d insights processed", processed)
def test_annotate_insight_authenticated(client):
    """An authenticated annotation is applied directly, without casting a vote."""
    credentials = base64.b64encode(b"a:b").decode("ascii")
    response = client.simulate_post(
        "/api/v1/insights/annotate",
        params={"insight_id": insight_id, "annotation": -1},
        headers={"Authorization": "Basic " + credentials},
    )

    assert response.status_code == 200
    assert response.json == {
        "description": "the annotation was saved",
        "status": "saved",
    }

    # For authenticated users the insight is validated directly and the
    # annotator's username is tracked; no vote row is created.
    recorded_votes = list(AnnotationVote.select())
    assert len(recorded_votes) == 0

    insight_query = ProductInsight.select().where(ProductInsight.id == insight_id)
    insight = next(insight_query.dicts().iterator())
    expected_subset = {
        "username": "******",
        "annotation": -1,
        "n_votes": 0,
    }
    # Strict-superset comparison on the (key, value) pairs of the row.
    assert insight.items() > expected_subset.items()
    assert "completed_at" in insight
def generate_fiber_quality_facet():
    """Push fiber data-quality facets onto products that mention fiber.

    Walks the ``nutrient_mention`` insights that have a source image and
    whose ``mentions`` data contains ``"fiber"``. A product is skipped when
    it is missing from the store, the image is not valid, or it already has
    a ``fiber`` / ``fiber_prepared`` nutriment. Any missing facet is pushed
    to both ``data_quality_tags`` and ``data_quality_warnings_tags``; each
    barcode is updated at most once.
    """
    product_store: DBProductStore = get_product_store()
    collection = product_store.collection
    added = 0
    updated_barcodes: Set[str] = set()

    fiber_mentions = ProductInsight.select(
        ProductInsight.barcode, ProductInsight.source_image
    ).where(
        ProductInsight.type == InsightType.nutrient_mention.name,
        ProductInsight.data["mentions"].contains("fiber"),
        ProductInsight.source_image.is_null(False),
    )

    for insight in fiber_mentions.iterator():
        barcode = insight.barcode
        if barcode in updated_barcodes:
            continue

        product = product_store.get_product(
            barcode, ["nutriments", "data_quality_tags", "images"])
        if product is None:
            continue

        nutriments = product.get("nutriments", {})
        data_quality_tags = product.get("data_quality_tags", {})
        images = product.get("images", {})

        if not is_valid_image(images, insight.source_image):
            continue
        if "fiber" in nutriments or "fiber_prepared" in nutriments:
            continue

        facets = []
        if FIBER_QUALITY_FACET_NAME not in data_quality_tags:
            facets.append(FIBER_QUALITY_FACET_NAME)
        if (FIBER_NUTRITION_QUALITY_FACET_NAME not in data_quality_tags
                and is_nutrition_image(images, insight.source_image)):
            facets.append(FIBER_NUTRITION_QUALITY_FACET_NAME)

        if not facets:
            continue

        logger.info("Adding facets to {}: {}".format(barcode, facets))
        updated_barcodes.add(barcode)
        added += 1
        push_spec = {
            "data_quality_tags": {"$each": facets},
            "data_quality_warnings_tags": {"$each": facets},
        }
        collection.update_one({"code": barcode}, {"$push": push_spec})

    logger.info("Fiber quality facets added on {} products".format(added))
def process_product_insights(self, barcode: str,
                             insights: List[JSONType]) -> Iterable[JSONType]:
    """Yield insert payloads for the valid, not-yet-seen codes of ``barcode``.

    The seen set starts with the ``data['text']`` value of every stored
    insight of this type for the barcode. Each yielded code is then added,
    and the set is handed to ``self.is_valid`` for every candidate.
    """
    stored_texts = ProductInsight.select(
        ProductInsight.data['text'].as_json().alias('text')
    ).where(
        ProductInsight.type == self.get_type(),
        ProductInsight.barcode == barcode,
    )
    code_seen: Set[str] = {row.text for row in stored_texts.iterator()}

    for candidate in insights:
        content = candidate['content']
        emb_code = content['text']

        if not self.is_valid(barcode, emb_code, code_seen):
            continue

        source = candidate['source']
        payload_data = {
            'source': source,
            'matcher_type': content['type'],
            'raw': content['raw'],
            'text': emb_code,
            'notify': content['notify'],
        }
        yield {'source_image': source, 'data': payload_data}
        # Recorded only after the consumer resumes the generator.
        code_seen.add(emb_code)
def generate_seen_set_query(insight_type: str, barcode: str,
                            server_domain: str):
    """Build the query returning ``value_tag`` of the matching insights.

    Matches on insight type, barcode and server domain. The query is
    returned unexecuted.
    """
    conditions = (
        ProductInsight.type == insight_type,
        ProductInsight.barcode == barcode,
        ProductInsight.server_domain == server_domain,
    )
    value_tags = ProductInsight.select(ProductInsight.value_tag)
    return value_tags.where(*conditions)
def process_product_insights(self, barcode: str, insights: List[JSONType],
                             server_domain: str) -> Iterable[JSONType]:
    """Yield insert payloads for the valid, not-yet-seen codes of ``barcode``.

    The seen set starts with the ``value`` of every stored insight of this
    type for the barcode on ``server_domain``, and is extended with each
    yielded code.
    """
    stored_values = ProductInsight.select(ProductInsight.value).where(
        ProductInsight.type == self.get_type(),
        ProductInsight.barcode == barcode,
        ProductInsight.server_domain == server_domain,
    )
    seen_set: Set[str] = {row.value for row in stored_values.iterator()}

    for candidate in insights:
        content = candidate["content"]
        emb_code = content["text"]

        if not self.is_valid(barcode, emb_code, seen_set):
            continue

        yield {
            "source_image": candidate["source"],
            "value": emb_code,
            "data": {
                "matcher_type": content["type"],
                "raw": content["raw"],
                "notify": content["notify"],
            },
        }
        # Recorded only after the consumer resumes the generator.
        seen_set.add(emb_code)
def run(insight_type: str):
    """Automatically annotate the processable insights of ``insight_type``.

    Un-annotated insights are visited in random order. An insight is skipped
    when its ``process_after`` is still in the future or, for label insights,
    when its ``value_tag`` is not in ``AUTHORIZED_LABELS``. Insights for
    which ``is_automatically_processable`` raises ``InvalidInsight`` are
    deleted.
    """
    count = 0
    insight: ProductInsight
    annotator = InsightAnnotatorFactory.get(insight_type)
    is_label_type = insight_type == InsightType.label.name

    candidates = (
        ProductInsight.select()
        .where(ProductInsight.type == insight_type,
               ProductInsight.annotation.is_null())
        .order_by(fn.Random())
    )

    for insight in candidates:
        scheduled_at = insight.process_after
        if scheduled_at is not None and scheduled_at >= datetime.datetime.utcnow():
            continue

        if is_label_type and insight.value_tag not in AUTHORIZED_LABELS:
            continue

        try:
            is_processable = is_automatically_processable(insight)
        except InvalidInsight:
            logger.info("Deleting insight {}".format(insight.id))
            insight.delete_instance()
            continue

        if not is_processable:
            continue

        logger.info("Annotating insight {} (barcode: {})".format(
            insight.value_tag, insight.barcode))
        annotator.annotate(insight, 1, update=True)
        count += 1

    logger.info("Annotated insights: {}".format(count))
def process_product_insights(self, barcode: str,
                             insights: List[JSONType]) -> Iterable[JSONType]:
    """Yield insert payloads for the valid, not-yet-seen labels.

    The seen set starts with the ``value_tag`` of every stored insight of
    this type for ``barcode``. Validation uses the barcode carried by each
    candidate insight, not the ``barcode`` argument.
    """
    stored_tags = ProductInsight.select(ProductInsight.value_tag).where(
        ProductInsight.type == self.get_type(),
        ProductInsight.barcode == barcode,
    )
    label_seen: Set[str] = {row.value_tag for row in stored_tags.iterator()}

    for candidate in insights:
        candidate_barcode = candidate['barcode']
        content = candidate['content']
        label_tag = content['label_tag']

        if not self.is_valid(candidate_barcode, label_tag, label_seen):
            continue

        source = candidate['source']
        # Removed from `content` (in place) so it is not copied into 'data'.
        automatic_processing = content.pop('automatic_processing', None)
        record = {
            'value_tag': label_tag,
            'source_image': source,
            'data': {'source': source, **content},
        }
        if automatic_processing is not None:
            record['automatic_processing'] = automatic_processing

        yield record
        label_seen.add(label_tag)
def test_import_product_not_in_store(self, predictions):
    """No insight is imported for a product the store maps to ``None``."""
    # The product store explicitly reports barcode1 as missing.
    store_without_product = {barcode1: None}
    imported = self._run_import(predictions, product_store=store_without_product)
    assert imported == 0
    # The table row count is still 1, i.e. nothing was added by the import
    # (presumably the single row comes from the test fixture).
    insight_count = ProductInsight.select().count()
    assert insight_count == 1
def process_insights():
    """Annotate every non-latent insight whose processing time has passed.

    Runs inside ``with db``. Selects insights with a NULL annotation, a
    non-NULL ``process_after`` not later than the current UTC time, and
    ``latent == False``, and annotates each with value ``1``. Slack is
    notified when the result is ``UPDATED_ANNOTATION_RESULT`` and the
    insight data has a truthy ``notify`` entry.
    """
    processed = 0

    with db:
        due_insights = ProductInsight.select().where(
            ProductInsight.annotation.is_null(),
            ProductInsight.process_after.is_null(False),
            ProductInsight.process_after <= datetime.datetime.utcnow(),
            ProductInsight.latent == False,  # noqa: E712
        )
        for insight in due_insights.iterator():
            annotator = InsightAnnotatorFactory.get(insight.type)
            message = "Annotating insight {} (product: {})".format(
                insight.id, insight.barcode
            )
            logger.info(message)
            annotation_result = annotator.annotate(insight, 1, update=True)
            processed += 1

            if annotation_result != UPDATED_ANNOTATION_RESULT:
                continue
            if insight.data.get("notify", False):
                slack.notify_automatic_processing(insight)

    logger.info("{} insights processed".format(processed))
def apply_insights(insight_type: str, max_timedelta: datetime.timedelta):
    """Automatically annotate the processable insights of ``insight_type``.

    Un-annotated insights are visited in random order. An insight is skipped
    when its ``process_after`` is still in the future or, for label insights,
    when its ``value_tag`` is not among the authorized labels. Insights for
    which ``is_automatically_processable`` raises ``InvalidInsight`` are
    deleted. ``max_timedelta`` is forwarded to that check.
    """
    logger.info("Timedelta: {}".format(max_timedelta))
    count = 0
    insight: ProductInsight
    annotator = InsightAnnotatorFactory.get(insight_type)
    authorized_labels: Set[str] = AUTHORIZED_LABELS_STORE.get()
    is_label_type = insight_type == InsightType.label.name

    candidates = (
        ProductInsight.select()
        .where(
            ProductInsight.type == insight_type,
            ProductInsight.annotation.is_null(),
        )
        .order_by(fn.Random())
    )

    for insight in candidates:
        scheduled_at = insight.process_after
        if scheduled_at is not None and scheduled_at >= datetime.datetime.utcnow():
            continue

        if is_label_type and insight.value_tag not in authorized_labels:
            continue

        try:
            is_processable = is_automatically_processable(insight, max_timedelta)
        except InvalidInsight:
            logger.info("Deleting insight {}".format(insight.id))
            insight.delete_instance()
            continue

        if not is_processable:
            continue

        shown_value = insight.value_tag or insight.value
        logger.info("Annotating insight {} (barcode: {})".format(
            shown_value, insight.barcode))
        annotator.annotate(insight, 1, update=True, automatic=True)
        count += 1

    logger.info("Annotated insights: {}".format(count))
def updated_product_update_insights(barcode: str):
    """Refresh the insights of a product that has just been updated.

    Fetches the product, tries to add a category insight for it, then runs
    ``delete_invalid_insight`` on every un-annotated insight of the barcode.
    One validator is created per insight type and reused. Returns early,
    with a warning, when the product cannot be fetched.
    """
    product_dict = get_product(barcode)

    if product_dict is None:
        # `Logger.warn` is a deprecated alias; `warning` is the supported name.
        logger.warning("Updated product does not exist: {}".format(barcode))
        return

    category_added = updated_product_add_category_insight(
        barcode, product_dict)

    if category_added:
        logger.info("Product {} updated".format(barcode))

    product = Product(product_dict)
    # Validators are cached per insight type for the duration of the call.
    validators: Dict[str, InsightValidator] = {}

    pending_insights = ProductInsight.select().where(
        ProductInsight.annotation.is_null(),
        ProductInsight.barcode == barcode,
    ).iterator()

    for insight in pending_insights:
        if insight.type not in validators:
            validators[insight.type] = InsightValidatorFactory.create(
                insight.type, None)

        validator = validators[insight.type]
        insight_deleted = delete_invalid_insight(insight,
                                                 validator=validator,
                                                 product=product)
        if insight_deleted:
            logger.info("Insight {} deleted (type: {})".format(
                insight.id, insight.type))
def mark_insights():
    """Schedule automatic insights for processing ten minutes from now.

    Inside a single atomic block, every insight with
    ``automatic_processing == True``, ``latent == False``, a NULL
    ``process_after`` and a NULL annotation gets ``process_after`` set to
    the current UTC time plus ten minutes, and is saved.
    """
    marked = 0
    delay = datetime.timedelta(minutes=10)

    with db, db.atomic():
        unscheduled = ProductInsight.select().where(
            ProductInsight.automatic_processing == True,  # noqa: E712
            ProductInsight.latent == False,  # noqa: E712
            ProductInsight.process_after.is_null(),
            ProductInsight.annotation.is_null(),
        )
        for insight in unscheduled.iterator():
            logger.info(
                "Marking insight {} as processable automatically "
                "(product: {})".format(insight.id, insight.barcode)
            )
            insight.process_after = datetime.datetime.utcnow() + delay
            insight.save()
            marked += 1

    logger.info("{} insights marked".format(marked))
def refresh_insights(with_deletion: bool = False):
    """Delete or refresh stale un-annotated insights against the dataset.

    The job aborts, with a warning, unless the modification date of
    ``settings.JSONL_MIN_DATASET_PATH`` equals today's date. Otherwise, for
    every un-annotated insight of ``settings.OFF_SERVER_DOMAIN`` created at
    or before today's midnight threshold:

    * if the product is absent from the store, the insight is deleted, but
      only when ``with_deletion`` is true;
    * else if ``delete_invalid_insight`` reports a deletion, it is counted;
    * else the insight attributes are refreshed from the product.

    All changes happen inside one atomic block.
    """
    deleted = 0
    updated = 0
    product_store = CACHED_PRODUCT_STORE.get()

    datetime_threshold = datetime.datetime.utcnow().replace(hour=0,
                                                            minute=0,
                                                            second=0,
                                                            microsecond=0)
    # NOTE(review): the threshold is derived from UTC while fromtimestamp()
    # returns local time; confirm the date comparison below is intended on
    # hosts that do not run in UTC.
    dataset_datetime = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_MIN_DATASET_PATH))

    if dataset_datetime.date() != datetime_threshold.date():
        # `Logger.warn` is a deprecated alias; `warning` is the supported name.
        logger.warning(
            "Dataset version is not up to date, aborting insight removal job")
        return

    # Validators are cached per insight type for the duration of the job.
    validators: Dict[str, InsightValidator] = {}

    with db, db.atomic():
        stale_insights = ProductInsight.select().where(
            ProductInsight.annotation.is_null(),
            ProductInsight.timestamp <= datetime_threshold,
            ProductInsight.server_domain == settings.OFF_SERVER_DOMAIN,
        ).iterator()

        for insight in stale_insights:
            product: Product = product_store[insight.barcode]

            if product is None:
                if with_deletion:
                    # Product has been deleted from OFF
                    logger.info("Product with barcode {} deleted"
                                "".format(insight.barcode))
                    deleted += 1
                    insight.delete_instance()
                continue

            if insight.type not in validators:
                validators[insight.type] = InsightValidatorFactory.create(
                    insight.type, product_store)

            validator = validators[insight.type]
            insight_deleted = delete_invalid_insight(insight, validator)

            if insight_deleted:
                deleted += 1
                logger.info("invalid insight {} (type: {}), deleting..."
                            "".format(insight.id, insight.type))
                continue

            insight_updated = update_insight_attributes(product, insight)

            if insight_updated:
                updated += 1

    logger.info("{} insights deleted".format(deleted))
    logger.info("{} insights updated".format(updated))
def get_insights(barcode: Optional[str] = None,
                 keep_types: Optional[List[str]] = None,
                 country: Optional[str] = None,
                 brands: Optional[List[str]] = None,
                 count: int = 25) -> Iterable[ProductInsight]:
    """Return an iterator over random un-annotated insights.

    Args:
        barcode: when truthy, restrict to this barcode.
        keep_types: when non-empty, restrict to these insight types.
        country: when not ``None``, require ``countries`` to contain it.
        brands: when non-empty, require ``brands`` to contain any of them.
        count: maximum number of insights returned.

    Returns:
        The iterator of the randomly ordered, limited query.
    """
    where_clauses = [ProductInsight.annotation.is_null()]

    if barcode:
        where_clauses.append(ProductInsight.barcode == barcode)

    if keep_types:
        where_clauses.append(ProductInsight.type.in_(keep_types))

    if country is not None:
        where_clauses.append(ProductInsight.countries.contains(country))

    if brands:
        where_clauses.append(ProductInsight.brands.contains_any(brands))

    query = (ProductInsight.select()
             .where(*where_clauses)
             .limit(count)
             .order_by(peewee.fn.Random()))
    return query.iterator()
def run():
    """Migrate stored insights: drop redundant ``data['source']`` and fix fields.

    For every insight:

    * when ``data['source']`` equals ``source_image`` the key is removed from
      ``data``; when it differs, the insight is counted as an error;
    * depending on the insight type, ``check_tag_field`` / ``check_field``
      are invoked for the type's field names. These helpers are defined
      elsewhere; a truthy return is treated here as "the insight changed".

    Changed insights are saved. The number of removed ``source`` keys and
    the number of mismatches are logged at the end.
    """
    # insight type -> (name passed to check_tag_field, name passed to
    # check_field or None when the type has no plain-value field to check).
    field_checks = {
        "label": ("label_tag", None),
        "brand": ("brand_tag", "brand"),
        "store": ("store_tag", "store"),
        "packaging": ("packaging_tag", "packaging"),
        "category": ("category", None),
    }

    count = 0
    errors = 0
    insight: ProductInsight

    # `id` and `type` must be selected as well: the loop reads both, and
    # `save()` needs the primary key to issue an UPDATE for the right row.
    # With a partial select, peewee leaves unselected fields as None.
    for insight in ProductInsight.select(
            ProductInsight.id,
            ProductInsight.type,
            ProductInsight.data,
            ProductInsight.value,
            ProductInsight.value_tag,
            ProductInsight.source_image,
    ):
        save = False

        if "source" in insight.data:
            data_source_image = insight.data["source"]
            if data_source_image == insight.source_image:
                insight.data.pop("source")
                logger.info("Deleting source field for insight {}".format(
                    insight.id))
                count += 1
                save = True
            else:
                errors += 1

        tag_name, value_name = field_checks.get(insight.type, (None, None))
        # Both helpers are always called (no short-circuit between them),
        # as each may modify the insight.
        if tag_name is not None and check_tag_field(insight, tag_name):
            save = True
        if value_name is not None and check_field(insight, value_name):
            save = True

        if save:
            insight.save()

    logger.info("Updated insights: {}".format(count))
    logger.info("Errors: {}".format(errors))
def exist_latent(latent_insight: JSONType) -> bool:
    """Tell whether a stored insight matches ``latent_insight``.

    A match has the same barcode, type, server domain, ``value_tag``,
    ``value`` and ``source_image``. The last three are read with
    ``dict.get``, so an absent key is compared as ``None``.
    """
    matching = ProductInsight.select().where(
        ProductInsight.barcode == latent_insight["barcode"],
        ProductInsight.type == latent_insight["type"],
        ProductInsight.server_domain == latent_insight["server_domain"],
        ProductInsight.value_tag == latent_insight.get("value_tag"),
        ProductInsight.value == latent_insight.get("value"),
        ProductInsight.source_image == latent_insight.get("source_image"),
    )
    n_matching = matching.count()
    return bool(n_matching)
def generate_seen_set_query(insight_type: InsightType, barcode: str,
                            server_domain: str):
    """Build the query returning ``value`` and ``value_tag`` of known insights.

    Matches non-latent insights on insight type (by enum name), barcode and
    server domain. The query is returned unexecuted.
    """
    conditions = (
        ProductInsight.type == insight_type.name,
        ProductInsight.latent == False,  # noqa: E712
        ProductInsight.barcode == barcode,
        ProductInsight.server_domain == server_domain,
    )
    selection = ProductInsight.select(ProductInsight.value,
                                      ProductInsight.value_tag)
    return selection.where(*conditions)
def test_import_one(self, predictions):
    """Importing the predictions reports exactly one imported insight."""
    imported = self._run_import(predictions)
    assert imported == 1

    # The table holds exactly one row, and the lookup below expects it to be
    # a row other than `insight_id1`.
    insight_count = ProductInsight.select().count()
    assert insight_count == 1

    inserted = ProductInsight.get(ProductInsight.id != insight_id1)
    assert inserted.value_tag == "en:smoked-salmons"
    assert inserted.server_domain == settings.OFF_SERVER_DOMAIN
    assert not inserted.automatic_processing
def process_product_insights(self, insights: Iterable[JSONType],
                             automatic: bool) -> Iterable[JSONType]:
    """Yield insert rows for the valid, not-yet-seen category insights.

    Known categories are tracked per barcode, seeded from every stored
    insight of this type and extended with each yielded category. Country
    and brand tags are read from the product store entry, defaulting to an
    empty list. All rows of one call share the same UTC timestamp.

    ``automatic`` is accepted but not read; ``automatic_processing`` is
    always ``False`` in the yielded rows.
    """
    category_seen: Dict[str, Set[str]] = {}
    stored_rows = ProductInsight.select(
        ProductInsight.value_tag, ProductInsight.barcode
    ).where(ProductInsight.type == self.get_type())
    for row in stored_rows.iterator():
        category_seen.setdefault(row.barcode, set()).add(row.value_tag)

    timestamp = datetime.datetime.utcnow()
    # Optional keys copied verbatim into the row's 'data' when present.
    optional_keys = ('category_depth', 'model', 'confidence',
                     'product_name', 'lang')

    for candidate in insights:
        barcode = candidate['barcode']
        category = candidate['category']

        if not self.is_valid(barcode, category, category_seen):
            continue

        countries_tags = getattr(self.product_store[barcode],
                                 'countries_tags', [])
        brands_tags = getattr(self.product_store[barcode],
                              'brands_tags', [])

        row_to_insert = {
            'id': str(uuid.uuid4()),
            'type': self.get_type(),
            'barcode': barcode,
            'countries': countries_tags,
            'brands': brands_tags,
            'timestamp': timestamp,
            'value_tag': category,
            'automatic_processing': False,
            'data': {
                'category': category,
            },
        }
        for key in optional_keys:
            if key in candidate:
                row_to_insert['data'][key] = candidate[key]

        yield row_to_insert
        category_seen.setdefault(barcode, set()).add(category)
def test_import_auto(self):
    """A high-confidence 'auto' prediction is flagged for automatic processing."""
    prediction = neural_prediction("en:smoked-salmons", confidence=0.91, auto=True)
    imported = self._run_import([prediction])
    assert imported == 1

    # The table holds exactly one row, and the lookup below expects it to be
    # a row other than `insight_id1`.
    insight_count = ProductInsight.select().count()
    assert insight_count == 1

    inserted = ProductInsight.get(ProductInsight.id != insight_id1)
    assert inserted.value_tag == "en:smoked-salmons"
    assert inserted.server_domain == settings.OFF_SERVER_DOMAIN
    assert inserted.automatic_processing
def test_vote_cascade_on_insight_deletion(peewee_db):
    """Deleting an insight must also remove the votes attached to it."""
    with peewee_db.atomic():
        insight = ProductInsightFactory(n_votes=2)
        # Two votes referencing the insight created above.
        for _ in range(2):
            AnnotationVoteFactory(insight_id=insight)

    with peewee_db.atomic():
        # NOTE(review): `delete()` builds a DELETE query with no WHERE
        # clause here; confirm whether `delete_instance()` was intended.
        deletion = insight.delete()
        deletion.execute()

    remaining_insights = ProductInsight.select().count()
    remaining_votes = AnnotationVote.select().count()
    assert remaining_insights == 0
    assert remaining_votes == 0
def test_annotate_insight_majority_vote_overridden(client):
    """A fifth anonymous vote on a 2-vs-2 insight resolves it to annotation 0."""
    # Pre-existing insight votes: two "yes" and two "no".
    existing_votes = (
        (1, "yes-voter1"),
        (1, "yes-voter2"),
        (-1, "no-voter1"),
        (-1, "no-voter2"),
    )
    for vote_value, device in existing_votes:
        AnnotationVoteFactory(
            insight_id=insight_id,
            value=vote_value,
            device_id=device,
        )

    payload = {
        "insight_id": insight_id,
        "device_id": "no-voter3",
        "annotation": -1,
        "update": False,  # disable actually updating the product in PO.
    }
    response = client.simulate_post("/api/v1/insights/annotate", params=payload)

    assert response.status_code == 200
    assert response.json == {
        "description": "the annotation was saved",
        "status": "saved",
    }

    recorded_votes = list(AnnotationVote.select())
    assert len(recorded_votes) == 5

    insight_query = ProductInsight.select().where(ProductInsight.id == insight_id)
    insight = next(insight_query.dicts().iterator())
    # The insight should be annotated with '0', with a None username since
    # this was resolved with an anonymous vote.
    expected_subset = {
        "annotation": 0,
        "username": None,
        "n_votes": 5,
    }
    assert insight.items() > expected_subset.items()
def get_image_orientation(barcode: str, image_id: str) -> Optional[int]:
    """Return the stored ``rotation`` of image ``image_id`` of a product.

    Scans the ``image_orientation`` insights of ``barcode`` on
    ``settings.OFF_SERVER_DOMAIN`` that have a source image. Returns the
    ``rotation`` entry of the first insight whose source image resolves to
    ``image_id``, or ``None`` when there is no match, no such entry, or
    ``image_id`` is ``None``.
    """
    orientation_insights = ProductInsight.select(
        ProductInsight.data, ProductInsight.source_image
    ).where(
        ProductInsight.barcode == barcode,
        ProductInsight.type == InsightType.image_orientation.name,
        ProductInsight.server_domain == settings.OFF_SERVER_DOMAIN,
        ProductInsight.source_image.is_null(False),
    )

    for insight in orientation_insights.iterator():
        candidate_id = get_image_id(insight.source_image)  # type: ignore

        if image_id is None or candidate_id != image_id:
            continue

        return insight.data.get("rotation")

    return None
def get_existing_insight(insight_type: InsightType, barcode: str,
                         server_domain: str) -> List[ProductInsight]:
    """Return the stored insights of ``insight_type`` for a product.

    Only ``annotation``, ``id``, ``value`` and ``value_tag`` are selected.
    Rows are matched on insight type (by enum name), barcode and server
    domain.
    """
    selected_columns = (
        ProductInsight.annotation,
        ProductInsight.id,
        ProductInsight.value,
        ProductInsight.value_tag,
    )
    query = ProductInsight.select(*selected_columns).where(
        ProductInsight.type == insight_type.name,
        ProductInsight.barcode == barcode,
        ProductInsight.server_domain == server_domain,
    )
    return list(query)
def update_insights(barcode: str, server_domain: str):
    """Refresh predictions and re-validate insights after a product update.

    Waits ten seconds, fetches the product, regenerates its predicted
    insights and ingredients, then runs ``validate_insight`` on every
    un-annotated insight of the barcode on ``server_domain``. Insight types
    for which the factory returns no validator are left untouched. Returns
    early, with a warning, when the product cannot be fetched.
    """
    # Sleep 10s to let the OFF update request that triggered the webhook call
    # to finish
    time.sleep(10)
    product_dict = get_product(barcode)

    if product_dict is None:
        # `Logger.warn` is a deprecated alias; `warning` is the supported name.
        logger.warning("Updated product does not exist: {}".format(barcode))
        return

    updated = updated_product_predict_insights(barcode, product_dict, server_domain)

    if updated:
        logger.info("Product {} updated".format(barcode))

    update_ingredients(barcode, product_dict, server_domain)

    product = Product(product_dict)
    # Validators (possibly None) are cached per insight type for this call.
    validators: Dict[str, Optional[InsightValidator]] = {}

    pending_insights = (
        ProductInsight.select()
        .where(
            ProductInsight.annotation.is_null(),
            ProductInsight.barcode == barcode,
            ProductInsight.server_domain == server_domain,
        )
        .iterator()
    )

    for insight in pending_insights:
        if insight.type not in validators:
            validators[insight.type] = InsightValidatorFactory.create(
                insight.type, None
            )

        validator = validators[insight.type]
        if validator is None:
            continue

        result = validate_insight(insight, validator=validator, product=product)
        if result == InsightValidationResult.deleted:
            logger.info(
                "Insight {} deleted (type: {})".format(insight.id, insight.type)
            )
        elif result == InsightValidationResult.updated:
            logger.info(
                "Insight {} converted to latent (type: {})".format(
                    insight.id, insight.type
                )
            )