def update_insight_attributes(product: Product, insight: ProductInsight) -> bool:
    """Synchronize product-derived insight attributes with the current product.

    Compares brands, countries and unique scan count between the insight and
    the product; any field that differs is copied from the product onto the
    insight, which is then saved.

    :param product: the up-to-date product.
    :param insight: the insight to refresh.
    :return: True if at least one attribute changed (and the insight was
        saved), False otherwise.
    """
    # (log label, insight attribute name, current product value) triples;
    # replaces three copy-pasted compare/log/assign stanzas.
    synced_fields = (
        ("brand", "brands", product.brands_tags),
        ("countries", "countries", product.countries_tags),
        ("unique scan count", "unique_scans_n", product.unique_scans_n),
    )
    to_update = False
    for label, attribute, product_value in synced_fields:
        insight_value = getattr(insight, attribute)
        if insight_value != product_value:
            # Lazy %-style args: only formatted if the record is emitted.
            logger.info(
                "Updating %s %s -> %s (%s)",
                label,
                insight_value,
                product_value,
                product.barcode,
            )
            to_update = True
            setattr(insight, attribute, product_value)

    if to_update:
        insight.save()

    return to_update
def annotate(
    self,
    insight: ProductInsight,
    annotation: int,
    update=True,
    auth: Optional[OFFAuthentication] = None,
) -> AnnotationResult:
    """Record a user annotation on an insight.

    :param insight: the insight being annotated.
    :param annotation: the annotation value; 1 means the insight is validated.
    :param update: if True and the annotation is positive, also push the
        change to the product itself.
    :param auth: optional authentication info, used to attribute the
        annotation to a username.
    :return: the result of the product update when one is performed,
        otherwise SAVED_ANNOTATION_RESULT.
    """
    username: Optional[str] = None
    if auth is not None:
        username = auth.username

        # A session cookie takes precedence over the plain username field.
        if auth.session_cookie:
            username = extract_username(auth.session_cookie)

    # Save the annotation and the user attribution in a single transaction.
    with db.atomic():
        insight.annotation = annotation
        insight.completed_at = datetime.datetime.utcnow()
        insight.save()

        if username:
            UserAnnotation.create(insight=insight, username=username)

    if annotation == 1 and update:
        # Positive annotation: apply the change to the actual product.
        return self.update_product(insight, auth=auth)

    return SAVED_ANNOTATION_RESULT
def test_mark_insights():
    """mark_insights should set process_after (now + 10 min) only on
    automatic, unannotated, not-yet-marked insights, and be idempotent."""
    now = datetime.utcnow()
    # not automatic
    not_auto = ProductInsightFactory(automatic_processing=False)
    # already marked
    marked = ProductInsightFactory(
        automatic_processing=True,
        annotation=None,
        process_after=now - timedelta(minutes=2),
    )
    # already annotated
    annotated = ProductInsightFactory(automatic_processing=True, annotation=1)
    # ready to be marked
    ready1 = ProductInsightFactory(automatic_processing=True)
    ready2 = ProductInsightFactory(automatic_processing=True)
    # run
    start = datetime.utcnow()
    num_marked = scheduler.mark_insights()
    end = datetime.utcnow()
    ten_min = timedelta(minutes=10)
    # two marked
    assert num_marked == 2
    assert (
        start + ten_min
        < ProductInsight.get(id=ready1.id).process_after
        < end + ten_min
    )
    assert (
        start + ten_min
        < ProductInsight.get(id=ready2.id).process_after
        < end + ten_min
    )
    # others did not change — look up by explicit .id, consistently with the
    # ready1/ready2 checks above (previously the model instance itself was
    # passed as the id filter)
    assert ProductInsight.get(id=not_auto.id).process_after is None
    assert ProductInsight.get(id=annotated.id).process_after is None
    assert ProductInsight.get(id=marked.id).process_after < start
    # run again should not mark anything more
    num_marked = scheduler.mark_insights()
    assert num_marked == 0
def import_insights(
    cls,
    predictions: List[Prediction],
    server_domain: str,
    automatic: bool,
    product_store: DBProductStore,
) -> int:
    """Import insights, this is the main method.

    Validates that every prediction has one of the required types, then
    applies each (to_create, to_delete) batch produced by
    `generate_insights`: obsolete insights are deleted and new ones are
    bulk-inserted.

    :return: the number of insights that were imported.
    """
    allowed_types = cls.get_required_prediction_types()
    for candidate in predictions:
        if candidate.type not in allowed_types:
            raise ValueError(
                f"unexpected prediction type: '{candidate.type}'")

    imported = 0
    for to_create, to_delete in cls.generate_insights(
        predictions, server_domain, automatic, product_store
    ):
        if to_delete:
            obsolete_ids = [insight.id for insight in to_delete]
            logger.info(
                f"Deleting insight IDs: {[str(x) for x in obsolete_ids]}")
            ProductInsight.delete().where(
                ProductInsight.id.in_(obsolete_ids)
            ).execute()
        if to_create:
            imported += batch_insert(
                ProductInsight,
                (model_to_dict(insight) for insight in to_create),
                50,
            )
    return imported
def generate_candidates(
    cls,
    product: Product,
    predictions: List[Prediction],
) -> Iterator[ProductInsight]:
    """Yield at most one product-weight insight candidate.

    No candidate is produced when the product already has a quantity or when
    there is no prediction; the first (highest-priority) prediction is
    turned into an insight, flagged non-automatic when ambiguous.
    """
    if product.quantity is not None or not predictions:
        # Don't generate candidates if the product weight is already
        # specified or if there are no predictions
        return

    # Only generate a single prediction at a time.
    # Predictions are sorted by ascending priority, so the first
    # prediction is assumed to be the best one
    prediction = predictions[0]
    insights_by_subtype = cls.group_by_subtype(predictions)

    insight = ProductInsight(**prediction.to_dict())

    if (
        len(
            set(
                x.value
                for x in insights_by_subtype[insight.data["matcher_type"]]
            )
        )
        > 1
    ) or insight.data.get("source") == "product_name":
        # Multiple candidates with the same subtype and value, or product
        # weight coming from the product name (less accurate than OCR data)
        # -> don't process automatically
        insight.automatic_processing = False
    yield insight
def test_get_insight_update_annotated_reference(self):
    """An annotated reference insight must never be scheduled for deletion.

    Note: the original test defined a local `TestInsightImporter` subclass
    that was never used (the call goes through
    `InsightImporterWithIsConflictingInsight`); the dead class is removed.
    """
    references = [
        ProductInsight(
            barcode=DEFAULT_BARCODE,
            type=InsightType.label,
            value_tag="tag1",
            id=uuid.UUID("a6aa784b-4d39-4baa-a16c-b2f1c9dac9f9"),
            annotation=0,
        ),
    ]
    candidates = [
        ProductInsight(
            barcode=DEFAULT_BARCODE,
            type=InsightType.label,
            value_tag="tag2",
            id=uuid.UUID("c984b252-fb31-41ea-b78e-6ca08b9f5e4b"),
        ),
    ]
    (
        to_create,
        to_delete,
    ) = InsightImporterWithIsConflictingInsight.get_insight_update(
        candidates, references
    )
    assert to_create == candidates
    # Annotated existing insight should not be deleted
    assert to_delete == []
def test_process_insight_category(mocker):
    """process_insights should annotate the pending category insight and push
    the category to the product, while leaving already-processed insights
    untouched."""
    mocker.patch(
        "robotoff.insights.annotate.get_product",
        return_value={"categories_tags": []},
    )
    mock = mocker.patch("robotoff.off.update_product")
    # a processed insight exists
    date0 = datetime.utcnow() - timedelta(minutes=10)
    id0, code0 = _create_insight(
        type="category", completed_at=date0, annotation=1
    )
    # an insight to be processed
    id1, code1 = _create_insight(type="category")
    # run process
    process_insights()
    # insight 0 not touched
    assert ProductInsight.get(id=id0).completed_at == date0
    # insight 1 processed
    insight = ProductInsight.get(id=id1)
    assert insight.completed_at is not None
    assert insight.completed_at <= datetime.utcnow()
    assert insight.annotation == 1
    # update_product called for item 1
    mock.assert_called_once_with(
        {
            "code": code1,
            "add_categories": "en:Salmons",
            "comment": f"[robotoff] Adding category 'en:Salmons', ID: {id1}",
        },
        auth=None,
        server_domain=settings.OFF_SERVER_DOMAIN,
    )
def test_popular_question_pagination(client, mocker):
    """Popular questions are sorted by scan count and paginated by `count`."""
    mocker.patch("robotoff.insights.question.get_product", return_value={})

    ProductInsight.delete().execute()  # remove default sample
    for i in range(12):
        ProductInsightFactory(barcode=i, unique_scans_n=100 - i)

    # page -> (expected status, expected barcodes in order)
    expected = {
        1: ("found", ["0", "1", "2", "3", "4"]),
        2: ("found", ["5", "6", "7", "8", "9"]),
        3: ("found", ["10", "11"]),
        4: ("no_questions", []),
    }
    for page, (status, barcodes) in expected.items():
        result = client.simulate_get(
            f"/api/v1/questions/popular?count=5&page={page}"
        )
        assert result.status_code == 200
        data = result.json
        assert data["count"] == 12
        assert data["status"] == status
        assert [q["barcode"] for q in data["questions"]] == barcodes
def generate_candidates(
    cls,
    product: Product,
    predictions: List[Prediction],
) -> Iterator[ProductInsight]:
    """Yield one automatically-processable insight per prediction."""
    for candidate in predictions:
        generated = ProductInsight(**candidate.to_dict())
        generated.automatic_processing = True
        yield generated
def test_import_one(self, predictions):
    """Importing one prediction yields exactly one non-automatic insight."""
    num_imported = self._run_import(predictions)
    assert num_imported == 1
    # a single insight exists in total
    assert ProductInsight.select().count() == 1
    created = ProductInsight.get(ProductInsight.id != insight_id1)
    assert created.value_tag == "en:smoked-salmons"
    assert created.server_domain == settings.OFF_SERVER_DOMAIN
    assert not created.automatic_processing
def delete_invalid_insight(insight: ProductInsight,
                           validator: Optional[InsightValidator]) -> bool:
    """Delete `insight` when the validator rejects it.

    :param insight: the insight to check.
    :param validator: validator for the insight type, or None to skip.
    :return: True when the insight was deleted, False otherwise.
    """
    if validator is None or validator.is_valid(insight):
        # No validator available, or the insight passed validation: keep it.
        return False

    insight.delete_instance()
    return True
def annotate(self, insight: ProductInsight, annotation: int, update=True) \
        -> AnnotationResult:
    """Store the annotation on the insight; on a positive annotation with
    `update` set, also apply the change to the product."""
    insight.annotation = annotation
    insight.completed_at = datetime.datetime.utcnow()
    insight.save()

    if annotation != 1 or not update:
        return SAVED_ANNOTATION_RESULT

    # Positive annotation and update requested: push to the product.
    return self.update_product(insight)
def process_annotation(
    self,
    insight: ProductInsight,
    data: Optional[Dict] = None,
    auth: Optional[OFFAuthentication] = None,
) -> AnnotationResult:
    """Attach the free-form annotation payload to the insight and save it."""
    payload = data
    insight.data["annotation"] = payload
    insight.save()
    return SAVED_ANNOTATION_RESULT
def test_import_auto(self):
    """A high-confidence neural prediction flagged `auto` is imported as an
    automatically-processable insight."""
    num_imported = self._run_import(
        [neural_prediction("en:smoked-salmons", confidence=0.91, auto=True)]
    )
    assert num_imported == 1
    # a single insight exists in total
    assert ProductInsight.select().count() == 1
    created = ProductInsight.get(ProductInsight.id != insight_id1)
    assert created.value_tag == "en:smoked-salmons"
    assert created.server_domain == settings.OFF_SERVER_DOMAIN
    assert created.automatic_processing
def annotate(self, insight: ProductInsight, annotation: int, update=True,
             session_cookie: Optional[str] = None) -> AnnotationResult:
    """Persist the annotation; on a positive annotation with `update` set,
    also push the change to the product using the session cookie."""
    insight.annotation = annotation
    insight.completed_at = datetime.datetime.utcnow()
    insight.save()

    should_update_product = annotation == 1 and update
    if should_update_product:
        return self.update_product(insight, session_cookie=session_cookie)

    return SAVED_ANNOTATION_RESULT
def test_process_insight_update_product_raises(mocker):
    """A failing product update must leave its insight unprocessed without
    preventing later insights in the batch from being processed."""

    def raise_for_salmons(params, *args, **kwargs):
        # Fail only for the salmon category so the second insight succeeds.
        if "en:Salmons" in params.values():
            raise Exception("Boom !")
        else:
            return

    mocker.patch(
        "robotoff.insights.annotate.get_product",
        return_value={"categories_tags": []},
    )
    mock = mocker.patch(
        "robotoff.off.update_product", side_effect=raise_for_salmons
    )
    # an insight to be processed, that will raise
    id1, code1 = _create_insight(type="category")
    # add another insight that should pass
    id2, code2 = _create_insight(type="category", value_tag="en:Tuna")
    # run process
    start = datetime.utcnow()
    process_insights()
    end = datetime.utcnow()
    # insight1 not marked processed
    insight = ProductInsight.get(id=id1)
    assert insight.completed_at is None
    assert insight.annotation is None
    # but update_product was called
    mock.assert_any_call(
        {
            "code": code1,
            "add_categories": "en:Salmons",
            "comment": f"[robotoff] Adding category 'en:Salmons', ID: {id1}",
        },
        auth=None,
        server_domain=settings.OFF_SERVER_DOMAIN,
    )
    # insight2 processed
    # and update_product was called
    insight = ProductInsight.get(id=id2)
    assert insight.completed_at is not None
    assert start < insight.completed_at < end
    assert insight.annotation == 1
    mock.assert_any_call(
        {
            "code": code2,
            "add_categories": "en:Tuna",
            "comment": f"[robotoff] Adding category 'en:Tuna', ID: {id2}",
        },
        auth=None,
        server_domain=settings.OFF_SERVER_DOMAIN,
    )
    # exactly two calls were made in total
    assert mock.call_count == 2
def generate_insights(cls, predictions, server_domain, automatic, product_store):
    """Yield a single (to_create, to_delete) pair of label insights."""
    to_create = [
        ProductInsight(
            barcode=DEFAULT_BARCODE,
            type=InsightType.label.name,
            value_tag="tag1",
        )
    ]
    to_delete = [
        ProductInsight(
            barcode=DEFAULT_BARCODE,
            type=InsightType.label.name,
            value_tag="tag2",
        )
    ]
    yield to_create, to_delete
def process_product_insights(self, barcode: str, insights: List[JSONType]) \
        -> Iterable[JSONType]:
    """Yield emb-code insight payloads for `barcode`, skipping codes already
    stored in the database or seen earlier in this batch."""
    seen_codes: Set[str] = set()
    stored = ProductInsight.select(
        ProductInsight.data['text'].as_json().alias('text')
    ).where(
        ProductInsight.type == self.get_type(),
        ProductInsight.barcode == barcode,
    )
    for row in stored.iterator():
        seen_codes.add(row.text)

    for insight in insights:
        content = insight['content']
        emb_code = content['text']

        if not self.is_valid(barcode, emb_code, seen_codes):
            continue

        source = insight['source']
        yield {
            'source_image': source,
            'data': {
                'source': source,
                'matcher_type': content['type'],
                'raw': content['raw'],
                'text': emb_code,
                'notify': content['notify'],
            }
        }
        seen_codes.add(emb_code)
def apply_insights(insight_type: str, max_timedelta: datetime.timedelta):
    """Automatically annotate pending insights of the given type.

    An insight is applied when its `process_after` delay has elapsed, it is
    (for labels) part of the authorized label list, and
    `is_automatically_processable` accepts it given `max_timedelta`. Insights
    flagged invalid during the check are deleted.

    :param insight_type: the insight type to process.
    :param max_timedelta: maximum age delta forwarded to
        `is_automatically_processable`.
    """
    # Lazy %-style logging args throughout: no string formatting when the
    # log level filters the record out.
    logger.info("Timedelta: %s", max_timedelta)
    count = 0
    insight: ProductInsight
    annotator = InsightAnnotatorFactory.get(insight_type)
    authorized_labels: Set[str] = AUTHORIZED_LABELS_STORE.get()

    for insight in (
        ProductInsight.select()
        .where(
            ProductInsight.type == insight_type,
            ProductInsight.annotation.is_null(),
        )
        .order_by(fn.Random())
    ):
        if (
            insight.process_after is not None
            and insight.process_after >= datetime.datetime.utcnow()
        ):
            # Too early to process this insight.
            continue

        if (
            insight_type == InsightType.label.name
            and insight.value_tag not in authorized_labels
        ):
            # Only whitelisted labels may be applied automatically.
            continue

        try:
            is_processable = is_automatically_processable(
                insight, max_timedelta
            )
        except InvalidInsight:
            logger.info("Deleting insight %s", insight.id)
            insight.delete_instance()
            continue

        if is_processable:
            logger.info(
                "Annotating insight %s (barcode: %s)",
                insight.value_tag or insight.value,
                insight.barcode,
            )
            annotator.annotate(insight, 1, update=True, automatic=True)
            count += 1

    logger.info("Annotated insights: %s", count)
def batch_annotate(insight_type: str, dry: bool = True, json_contains: Optional[Dict] = None):
    """Annotate all pending insights of `insight_type`, or report the count.

    :param insight_type: the insight type to process.
    :param dry: when True, only print how many insights match the filter.
    :param json_contains: optional JSON fragment the insight data must
        contain for the insight to be selected.
    """
    annotator = InsightAnnotatorFactory.get(insight_type)

    where_clauses = [
        ProductInsight.type == insight_type,
        ProductInsight.annotation.is_null(),
    ]
    if json_contains is not None:
        where_clauses.append(ProductInsight.data.contains(json_contains))

    query = ProductInsight.select().where(*where_clauses)

    if dry:
        count = query.count()
        # f-strings replace the chained "".format / % formatting.
        print(
            "-- dry run --\n"
            f"{count} items matching filter:\n"
            f" insight type: {insight_type}\n"
            f" filter: {json_contains}"
        )
    else:
        # enumerate replaces the manual counter.
        for i, insight in enumerate(query, start=1):
            print(f"Insight {i}")
            print(
                f"Add label {insight.data} to "
                f"https://fr.openfoodfacts.org/produit/{insight.barcode}"
            )
            print(insight.data)
            annotator.annotate(insight, 1, update=True)
def save_insight(
    insight_id: str,
    annotation: int,
    update: bool = True,
    data: Optional[Dict] = None,
    auth: Optional[OFFAuthentication] = None,
) -> AnnotationResult:
    """Look up the insight by id and record the given annotation on it.

    :return: UNKNOWN_INSIGHT_RESULT when the id does not exist,
        ALREADY_ANNOTATED_RESULT when the insight was already annotated,
        otherwise the annotator's result.
    """
    insight: Union[ProductInsight, None]
    try:
        insight = ProductInsight.get_by_id(insight_id)
    except ProductInsight.DoesNotExist:
        insight = None

    if insight is None:
        return UNKNOWN_INSIGHT_RESULT

    if insight.annotation is not None:
        # Never overwrite an existing annotation.
        return ALREADY_ANNOTATED_RESULT

    annotator = InsightAnnotatorFactory.get(insight.type)
    return annotator.annotate(insight, annotation, update, data=data, auth=auth)
def process_insights():
    """Annotate every insight whose `process_after` deadline has passed.

    Pending insights (no annotation, non-null `process_after` in the past)
    are automatically annotated positively; a Slack notification is sent for
    updated insights flagged with `notify`. A failure on one insight is
    logged and does not stop the batch.
    """
    processed = 0
    for insight in (
        ProductInsight.select()
        .where(
            ProductInsight.annotation.is_null(),
            ProductInsight.process_after.is_null(False),
            ProductInsight.process_after <= datetime.datetime.utcnow(),
        )
        .iterator()
    ):
        try:
            annotator = InsightAnnotatorFactory.get(insight.type)
            logger.info(
                "Annotating insight %s (product: %s)",
                insight.id,
                insight.barcode,
            )
            annotation_result = annotator.annotate(insight, 1, update=True)
            processed += 1

            if annotation_result == UPDATED_ANNOTATION_RESULT and insight.data.get(
                "notify", False
            ):
                slack.NotifierFactory.get_notifier().notify_automatic_processing(
                    insight
                )
        except Exception:
            # continue to the next one
            # Note: annotator already rolled-back the transaction
            # Fix: the previous message mixed an f-string (interpolating the
            # exception, redundant since exception() logs the traceback) with
            # lazy %s args, and read "(product) %s"; use pure lazy formatting.
            logger.exception(
                "error while handling annotation of insight %s (product: %s)",
                insight.id,
                insight.barcode,
            )
    logger.info("%d insights processed", processed)
def on_get(self, req: falcon.Request, resp: falcon.Response, insight_id: str):
    """Serve the insight with the given id as JSON; 404 when unknown."""
    try:
        matching_insight: ProductInsight = ProductInsight.get_by_id(insight_id)
    except ProductInsight.DoesNotExist:
        raise falcon.HTTPNotFound()

    resp.media = matching_insight.to_dict()
def test_annotate_insight_not_enough_votes(client):
    """An anonymous annotation is stored as a vote, not applied directly."""
    result = client.simulate_post(
        "/api/v1/insights/annotate",
        params={
            "insight_id": insight_id,
            "annotation": -1,
            "device_id": "voter1",
        },
    )
    assert result.status_code == 200
    assert result.json == {
        "description": "the annotation vote was saved",
        "status": "vote_saved",
    }

    # For non-authenticated users we expect the insight to not be validated,
    # with only a vote being cast.
    votes = list(AnnotationVote.select().dicts())
    assert len(votes) == 1
    vote = votes[0]
    assert vote["value"] == -1
    assert vote["username"] is None
    assert vote["device_id"] == "voter1"

    insight = next(
        ProductInsight.select()
        .where(ProductInsight.id == insight_id)
        .dicts()
        .iterator()
    )
    for field in ("username", "completed_at", "annotation"):
        assert not insight[field]
    assert insight.items() > {"n_votes": 1}.items()
def updated_product_update_insights(barcode: str):
    """Refresh insights after a product update.

    Fetches the product, possibly adds a category insight, then deletes any
    unannotated insights that are no longer valid for the updated product.

    :param barcode: barcode of the updated product.
    """
    product_dict = get_product(barcode)

    if product_dict is None:
        # logger.warn is a deprecated alias of logger.warning
        logger.warning("Updated product does not exist: %s", barcode)
        return

    category_added = updated_product_add_category_insight(
        barcode, product_dict
    )

    if category_added:
        logger.info("Product %s updated", barcode)

    product = Product(product_dict)
    # One validator instance per insight type, created lazily and cached.
    validators: Dict[str, InsightValidator] = {}

    for insight in (
        ProductInsight.select()
        .where(
            ProductInsight.annotation.is_null(),
            ProductInsight.barcode == barcode,
        )
        .iterator()
    ):
        if insight.type not in validators:
            validators[insight.type] = InsightValidatorFactory.create(
                insight.type, None
            )

        validator = validators[insight.type]
        insight_deleted = delete_invalid_insight(
            insight, validator=validator, product=product
        )
        if insight_deleted:
            logger.info(
                "Insight %s deleted (type: %s)", insight.id, insight.type
            )
def refresh_insights(with_deletion: bool = False):
    """Refresh all unannotated insights against the cached product store.

    Insights whose product no longer exists are deleted (when
    `with_deletion` is True), invalid insights are deleted, and the rest
    have their product-derived attributes updated. Aborts when the minified
    dataset file is not from today.

    :param with_deletion: when True, delete insights whose product has been
        removed from OFF.
    """
    deleted = 0
    updated = 0
    product_store = CACHED_PRODUCT_STORE.get()
    # Only touch insights older than today's dataset snapshot.
    datetime_threshold = datetime.datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    dataset_datetime = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_MIN_DATASET_PATH)
    )

    if dataset_datetime.date() != datetime_threshold.date():
        # logger.warn is a deprecated alias of logger.warning
        logger.warning(
            "Dataset version is not up to date, aborting insight removal job"
        )
        return

    validators: Dict[str, InsightValidator] = {}

    with db:
        with db.atomic():
            for insight in (
                ProductInsight.select()
                .where(
                    ProductInsight.annotation.is_null(),
                    ProductInsight.timestamp <= datetime_threshold,
                    ProductInsight.server_domain == settings.OFF_SERVER_DOMAIN,
                )
                .iterator()
            ):
                product: Product = product_store[insight.barcode]

                if product is None:
                    if with_deletion:
                        # Product has been deleted from OFF
                        logger.info(
                            "Product with barcode %s deleted", insight.barcode
                        )
                        deleted += 1
                        insight.delete_instance()
                else:
                    if insight.type not in validators:
                        validators[insight.type] = InsightValidatorFactory.create(
                            insight.type, product_store
                        )

                    validator = validators[insight.type]
                    insight_deleted = delete_invalid_insight(insight, validator)

                    if insight_deleted:
                        deleted += 1
                        logger.info(
                            "invalid insight %s (type: %s), deleting...",
                            insight.id,
                            insight.type,
                        )
                        continue

                    insight_updated = update_insight_attributes(
                        product, insight
                    )

                    if insight_updated:
                        updated += 1

    logger.info("%s insights deleted", deleted)
    logger.info("%s insights updated", updated)
def get_insights(barcode: Optional[str] = None,
                 keep_types: Optional[List[str]] = None,
                 country: Optional[str] = None,
                 brands: Optional[List[str]] = None,
                 count: int = 25) -> Iterable[ProductInsight]:
    """Return up to `count` random unannotated insights matching the filters.

    PEP 484 fix: the previous implicit-Optional annotations
    (`keep_types: List[str] = None`, `country: str = None`) are made
    explicit; `count` is annotated.

    :param barcode: restrict to a single product.
    :param keep_types: restrict to these insight types.
    :param country: restrict to insights whose countries contain this value.
    :param brands: restrict to insights matching any of these brands.
    :param count: maximum number of insights returned.
    :return: an iterator over matching insights, in random order.
    """
    where_clauses = [
        ProductInsight.annotation.is_null(),
    ]

    if barcode:
        where_clauses.append(ProductInsight.barcode == barcode)

    if keep_types:
        where_clauses.append(ProductInsight.type.in_(keep_types))

    if country is not None:
        where_clauses.append(ProductInsight.countries.contains(country))

    if brands:
        where_clauses.append(ProductInsight.brands.contains_any(brands))

    query = (ProductInsight.select()
             .where(*where_clauses)
             .limit(count)
             .order_by(peewee.fn.Random()))
    return query.iterator()
def process_product_insights(self, barcode: str, insights: List[JSONType]) \
        -> Iterable[JSONType]:
    """Yield label insight payloads, skipping label tags already stored in
    the database or produced earlier in this batch."""
    seen_tags: Set[str] = set()
    stored = ProductInsight.select(ProductInsight.value_tag).where(
        ProductInsight.type == self.get_type(),
        ProductInsight.barcode == barcode,
    )
    for row in stored.iterator():
        seen_tags.add(row.value_tag)

    for insight in insights:
        barcode = insight['barcode']
        content = insight['content']
        label_tag = content['label_tag']

        if not self.is_valid(barcode, label_tag, seen_tags):
            continue

        source = insight['source']
        automatic_processing = content.pop('automatic_processing', None)
        insert = {
            'value_tag': label_tag,
            'source_image': source,
            'data': {
                'source': source,
                **content
            }
        }

        if automatic_processing is not None:
            insert['automatic_processing'] = automatic_processing

        yield insert
        seen_tags.add(label_tag)
def process_product_insights(self, barcode: str, insights: List[JSONType]) \
        -> Iterable[JSONType]:
    """Yield at most one expiration-date insight for the product.

    Nothing is yielded when several distinct dates were detected or when an
    insight of this type already exists for the barcode."""
    if len(insights) > 1:
        logger.info("{} distinct expiration dates found for product "
                    "{}, aborting import".format(len(insights), barcode))
        return

    existing_count = ProductInsight.select().where(
        ProductInsight.type == self.get_type(),
        ProductInsight.barcode == barcode).count()
    if existing_count:
        return

    for insight in insights:
        content = insight['content']

        if not self.is_valid(barcode):
            continue

        source = insight['source']
        yield {
            'source_image': source,
            'data': {
                'source': source,
                'notify': content['notify'],
                **content
            }
        }
        # At most one expiration-date insight per product.
        break
def process_insights(
    self,
    data: Iterable[ProductInsights],
    server_domain: str,
    automatic: bool = False,
) -> Iterator[Insight]:
    """Yield one insight per (barcode, language) pair, skipping pairs that
    already exist as unannotated insights on `server_domain`."""
    seen_set: Set[Tuple[str, str]] = set()
    existing = ProductInsight.select(
        ProductInsight.barcode, ProductInsight.data
    ).where(
        ProductInsight.type == self.get_type(),
        ProductInsight.server_domain == server_domain,
        ProductInsight.annotation.is_null(True),
    )
    for row in existing.iterator():
        seen_set.add((row.barcode, row.data["lang"]))

    for product_insights in data:
        barcode = product_insights.barcode
        for insight in product_insights.insights:
            key = (barcode, insight.data["lang"])
            if key in seen_set:
                # Duplicate (barcode, lang) pair: skip.
                continue
            seen_set.add(key)

            yield Insight.from_raw_insight(
                insight, product_insights, latent=False
            )