def predict_category(output: str):
    """Run category prediction over the full JSONL dataset and dump results to *output*."""
    from robotoff import settings
    from robotoff.elasticsearch.category.predict import predict_from_dataset
    from robotoff.products import ProductDataset
    from robotoff.utils import dump_jsonl

    product_dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    dump_jsonl(output, predict_from_dataset(product_dataset))
def export_logo_annotation(
    output: pathlib.Path,
    server_domain: Optional[str] = None,
    annotated: Optional[bool] = None,
):
    """Export logo annotations as JSONL to *output*.

    Optionally restrict to a single *server_domain* and/or to (un)annotated
    logos: ``annotated=True`` keeps only logos with an annotation value,
    ``annotated=False`` only those without one.
    """
    from robotoff.models import ImageModel, ImagePrediction, LogoAnnotation, db
    from robotoff.utils import dump_jsonl

    with db:
        filters = []
        if server_domain is not None:
            filters.append(ImageModel.server_domain == server_domain)
        if annotated is not None:
            # annotated=True -> annotation_value IS NOT NULL (and vice versa)
            filters.append(LogoAnnotation.annotation_value.is_null(not annotated))

        query = LogoAnnotation.select().join(ImagePrediction).join(ImageModel)
        if filters:
            query = query.where(*filters)

        annotation_dicts = (annotation.to_dict() for annotation in query.iterator())
        dump_jsonl(output, annotation_dicts)
def generate_category_insights(products: Iterable[JSONType], batch_size: int):
    """Predict category insights for *products* in batches and dump them to a JSONL file.

    NOTE(review): this relies on a module-level ``lang`` name that is not
    defined in this function — confirm it exists where this code runs.
    """
    predicted = predict_from_product_batch(
        products,
        allowed_lang={lang},
        filter_blacklisted=True,
        batch_size=batch_size,
    )
    output_path = settings.PROJECT_DIR / "category_insights.{}.jsonl".format(lang)
    dump_jsonl(output_path, predicted)
def predict_category(output: str):
    """Predict categories for the whole JSONL dataset and write them as dicts to *output*."""
    from robotoff import settings
    from robotoff.elasticsearch.category.predict import predict_from_dataset
    from robotoff.products import ProductDataset
    from robotoff.utils import dump_jsonl

    product_dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    # Serialize lazily: each insight is converted to a dict as it is dumped.
    serialized = (insight.to_dict() for insight in predict_from_dataset(product_dataset))
    dump_jsonl(output, serialized)
def generate_spellcheck_insights(output: str, confidence: float):
    """Generate ingredient spellcheck insights from Elasticsearch and dump them to *output*."""
    from robotoff.ingredients import generate_insights
    from robotoff.utils import dump_jsonl, get_logger
    from robotoff.utils.es import get_es_client

    get_logger()  # initialize logging configuration
    es_client = get_es_client()
    dump_jsonl(output, generate_insights(es_client, confidence=confidence))
def generate_spellcheck_insights(
    output: str,
    index_name: str = "product_all",
    confidence: float = 0.5,
    max_errors: Optional[int] = None,
    limit: Optional[int] = None,
) -> None:
    """Generate spellcheck insights and dump them as JSONL to *output*.

    *index_name* selects the Elasticsearch index; *confidence* is the
    minimum score threshold; *max_errors* and *limit* bound the generation.
    """
    from robotoff.spellcheck import Spellchecker
    from robotoff.utils import dump_jsonl, get_logger
    from robotoff.utils.es import get_es_client

    logger = get_logger()
    logger.info("Max errors: {}".format(max_errors))

    checker = Spellchecker.load(
        client=get_es_client(),
        confidence=confidence,
        index_name=index_name,
    )
    insights = checker.generate_insights(max_errors=max_errors, limit=limit)
    dump_jsonl(output, insights)
def run(lang: Optional[str] = None):
    """Build the category training dataset for *lang* ('xx' when None) and dump it."""
    dataset = ProductDataset.load()
    stream = dataset.stream().filter_nonempty_tag_field('categories_tags')
    if lang is None:
        stream = stream.filter_nonempty_text_field('product_name')
    else:
        # Keep only products in the requested language with a localized name.
        stream = stream.filter_text_field('lang', lang).filter_nonempty_text_field(
            'product_name_{}'.format(lang))
    output_path = (settings.PROJECT_DIR / 'datasets' / 'category'
                   / 'category_{}.jsonl'.format(lang or 'xx'))
    count = dump_jsonl(output_path, generate_dataset(stream, lang))
    print(count)
def run(lang: Optional[str] = None):
    """Generate and dump the category dataset for *lang* ('xx' when None), with logging."""
    logger.info("Generating category dataset for lang {}".format(lang or "xx"))
    dataset = ProductDataset.load()
    stream = dataset.stream().filter_nonempty_tag_field("categories_tags")
    if lang is None:
        stream = stream.filter_nonempty_text_field("product_name")
    else:
        # Restrict to products of the requested language with a localized name.
        stream = stream.filter_text_field("lang", lang).filter_nonempty_text_field(
            "product_name_{}".format(lang)
        )
    target = (settings.PROJECT_DIR / "datasets" / "category"
              / "category_{}.jsonl".format(lang or "xx"))
    count = dump_jsonl(target, generate_dataset(stream, lang))
    logger.info("{} items for lang {}".format(count, lang or "xx"))
from robotoff import settings
from robotoff.products import ProductDataset
from robotoff.utils import dump_jsonl, get_logger

logger = get_logger()


def images_dimension_iter():
    """Yield ``[width, height, barcode, image_id]`` for every full-size product image."""
    dataset = ProductDataset.load()
    for product in dataset.stream().filter_nonempty_text_field("code"):
        for image_id, image_data in product.get("images", {}).items():
            # Only numeric image ids are considered; others are skipped.
            if not image_id.isdigit():
                continue
            sizes = image_data["sizes"]
            if "full" not in sizes:
                continue
            full_size = sizes["full"]
            yield [int(full_size["w"]), int(full_size["h"]),
                   product["code"], str(image_id)]


dump_jsonl(settings.PROJECT_DIR / "images_dimension.jsonl", images_dimension_iter())
def dump_insights():
    """Dump every insight (annotated or not, no limit) to the configured dump path."""
    logger.info("Dumping insights...")
    raw_iter = get_insights(as_dict=True, annotated=None, limit=None)
    dumped = dump_jsonl(settings.INSIGHT_DUMP_PATH, transform_insight_iter(raw_iter))
    logger.info("Dump finished, {} insights dumped".format(dumped))
import pathlib
from random import shuffle

from robotoff import settings
from robotoff.utils import dump_jsonl, jsonl_iter

lang = "pt"
input_path: pathlib.Path = (
    settings.DATASET_DIR / "category" / "category_{}.jsonl".format(lang)
)

items = list(jsonl_iter(input_path))
shuffle(items)

# Split shuffled items: 10% validation, 10% test, remaining 80% train.
val_count = len(items) // 10
splits = {
    "val": items[:val_count],
    "test": items[val_count:2 * val_count],
    "train": items[2 * val_count:],
}
for split_name, split_items in splits.items():
    dump_jsonl(
        input_path.with_name("category_{}.{}.jsonl".format(lang, split_name)),
        split_items,
    )