def predict_from_dataset(
    dataset: ProductDataset, from_datetime: Optional[datetime.datetime] = None
) -> Iterable[JSONType]:
    """Return an iterable of category insights, using the provided dataset.

    Args:
        dataset: a ProductDataset
        from_datetime: datetime threshold: only keep products modified after
            `from_datetime`
    """
    # Keep only products that have a code, a name and language/country tags,
    # but no categories yet.
    stream = (
        dataset.stream()
        .filter_nonempty_text_field("code")
        .filter_nonempty_text_field("product_name")
        .filter_empty_tag_field("categories_tags")
        .filter_nonempty_tag_field("countries_tags")
        .filter_nonempty_tag_field("languages_codes")
    )

    if from_datetime:
        stream = stream.filter_by_modified_datetime(from_t=from_datetime)

    products = stream.iter()
    logger.info("Performing prediction on products without categories")
    yield from predict_from_iterable(get_es_client(), products)
def init_elasticsearch(index: bool, data: bool, product: bool, category: bool,
                       product_version: str):
    """Create Elasticsearch indexes and/or export their data.

    Args:
        index: create the Elasticsearch indexes
        data: export data to the created indexes
        product: operate on the product index
        category: operate on the category index
        product_version: name to use for the product index
    """
    import orjson
    from robotoff import settings
    from robotoff.utils.es import get_es_client
    from robotoff.elasticsearch.product.dump import product_export
    from robotoff.elasticsearch.category.dump import category_export

    if index:
        # Both index configurations are read up-front, regardless of the
        # product/category flags.
        with settings.ELASTICSEARCH_PRODUCT_INDEX_CONFIG_PATH.open("rb") as f:
            product_index_config = orjson.loads(f.read())
        with settings.ELASTICSEARCH_CATEGORY_INDEX_CONFIG_PATH.open("rb") as f:
            category_index_config = orjson.loads(f.read())

        client = get_es_client()
        for enabled, index_name, index_config in (
            (product, product_version, product_index_config),
            (category, "category", category_index_config),
        ):
            if enabled:
                client.indices.create(index_name, index_config)

    if data:
        if product:
            product_export(version=product_version)
        if category:
            category_export()
def init_elasticsearch(
    load_index: bool = False,
    load_data: bool = True,
    to_load: Optional[List[str]] = None,
) -> None:
    """Manually insert the Elasticsearch data and/or indexes for products
    and categories.

    `to_load` specifies which indexes/data should be loaded - supported
    values are in robotoff.settings.ElasticsearchIndex.
    """
    from robotoff.elasticsearch.export import ElasticsearchExporter
    from robotoff.settings import ElasticsearchIndex
    from robotoff.utils import get_logger
    from robotoff.utils.es import get_es_client

    logger = get_logger()
    es_exporter = ElasticsearchExporter(get_es_client())

    if not to_load:
        return

    for index_name in to_load:
        if index_name in ElasticsearchIndex.SUPPORTED_INDICES:
            if load_index:
                es_exporter.load_index(
                    index_name, ElasticsearchIndex.SUPPORTED_INDICES[index_name]
                )
            if load_data:
                es_exporter.export_index_data(index_name)
        else:
            logger.error(f"Skipping over unknown Elasticsearch type: '{index_name}'")
def init_elasticsearch(index: bool, data: bool, product: bool, category: bool):
    """Create Elasticsearch indexes and/or export their data.

    Args:
        index: create the Elasticsearch indexes
        data: export data to the created indexes
        product: operate on the product index
        category: operate on the category index
    """
    import json
    from robotoff import settings
    from robotoff.utils.es import get_es_client
    from robotoff.elasticsearch.product.dump import product_export
    from robotoff.elasticsearch.category.dump import category_export

    if index:
        # Both index configurations are read up-front, regardless of the
        # product/category flags.
        with settings.ELASTICSEARCH_PRODUCT_INDEX_CONFIG_PATH.open('r') as f:
            product_index_config = json.load(f)
        with settings.ELASTICSEARCH_CATEGORY_INDEX_CONFIG_PATH.open('r') as f:
            category_index_config = json.load(f)

        client = get_es_client()
        for enabled, index_name, index_config in (
            (product, 'product', product_index_config),
            (category, 'category', category_index_config),
        ):
            if enabled:
                client.indices.create(index_name, index_config)

    if data:
        if product:
            product_export()
        if category:
            category_export()
def test_load_index_already_exists(mocker):
    """load_index() must not recreate an index that already exists."""
    mocker.patch("elasticsearch.client.IndicesClient.exists", return_value=True)
    create_mock = mocker.patch("elasticsearch.client.IndicesClient.create")

    ElasticsearchExporter(get_es_client()).load_index("category", "filepath/")

    create_mock.assert_not_called()
def generate_spellcheck_insights(output: str, confidence: float):
    """Generate spellcheck insights from Elasticsearch and dump them as
    JSONL to `output`."""
    from robotoff.ingredients import generate_insights
    from robotoff.utils import dump_jsonl, get_logger
    from robotoff.utils.es import get_es_client

    get_logger()

    insights = generate_insights(get_es_client(), confidence=confidence)
    dump_jsonl(output, insights)
def _refresh_elasticsearch():
    """Reload every supported Elasticsearch index and re-export its data."""
    logger.info("Refreshing Elasticsearch data")
    exporter = ElasticsearchExporter(get_es_client())
    supported = settings.ElasticsearchIndex.SUPPORTED_INDICES
    for index_name, config_path in supported.items():
        exporter.load_index(index_name, config_path)
        exporter.export_index_data(index_name)
def test_load_index(mocker):
    """load_index() must create the index when it does not exist yet."""
    mocker.patch("elasticsearch.client.IndicesClient.exists", return_value=False)
    create_mock = mocker.patch("elasticsearch.client.IndicesClient.create")
    exporter = ElasticsearchExporter(get_es_client())

    fake_config = mocker.mock_open(read_data='{"a":"b"}')
    with patch("builtins.open", fake_config):
        exporter.load_index("category", "filepath/")

    create_mock.assert_called_once()
def category_export():
    """Export the category taxonomy to Elasticsearch.

    Existing documents in the category index are deleted first, then the
    whole taxonomy is regenerated and bulk-inserted.
    """
    logger.info("Starting category export to Elasticsearch...")
    client = get_es_client()
    category_taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)
    logger.info("Deleting existing categories...")
    delete_categories(client)
    logger.info("Starting export...")
    category_data = generate_category_data(category_taxonomy)
    rows_inserted = perform_export(client, category_data,
                                   settings.ELASTICSEARCH_CATEGORY_INDEX)
    # Pass the argument to the logger (lazy %-formatting) instead of eagerly
    # interpolating with `%`; the message is only built if the record is emitted.
    logger.info("%d rows inserted", rows_inserted)
def test_spellcheck(text: str, confidence: float):
    """Run the spellchecker on `text` and pretty-print the detailed result."""
    import json
    from robotoff.utils.es import get_es_client
    from robotoff.spellcheck import Spellchecker
    from robotoff.utils import get_logger

    get_logger()

    spellchecker = Spellchecker.load(client=get_es_client(), confidence=confidence)
    prediction = spellchecker.predict_insight(text, detailed=True)
    print(json.dumps(prediction, indent=5))
def test_export_category_index_data(mocker):
    """export_index_data("category") must delete old documents, bulk-insert
    the taxonomy rows and return the number of inserted rows."""
    delete_mock = mocker.patch(
        "robotoff.elasticsearch.export.Elasticsearch.delete_by_query",
        return_value={"deleted": 10},
    )
    bulk_mock = mocker.patch("robotoff.utils.es.elasticsearch.Elasticsearch.bulk")
    mocker.patch(
        "robotoff.elasticsearch.category.dump.get_taxonomy",
        return_value=_category_taxonomy(),
    )

    exporter = ElasticsearchExporter(get_es_client())
    inserted_count = exporter.export_index_data("category")

    delete_mock.assert_called_once()
    bulk_mock.assert_called_once()
    assert inserted_count == 1
def product_export():
    """Export complete French products with French ingredient text to
    Elasticsearch.

    Products flagged with unknown ingredients
    ('ingredients-unknown-score-above-0' quality tag) are skipped.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_iter = (dataset.stream().filter_by_country_tag(
        'en:france').filter_nonempty_text_field(
            'ingredients_text_fr').filter_by_state_tag('en:complete').iter())
    product_iter = (p for p in product_iter
                    if 'ingredients-unknown-score-above-0' not in p.get(
                        'quality_tags', []))

    data = ((product['code'], {
        'ingredients_text_fr':
        normalize_ingredient_list(product['ingredients_text_fr'])
    }) for product in product_iter)

    logger.info("Importing products")
    es_client = get_es_client()
    # Capture and log the insert count instead of discarding it, consistent
    # with the other product_export implementation in this file.
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("%d rows inserted", inserted)
def generate_spellcheck_insights(
    output: str,
    index_name: str = "product_all",
    confidence: float = 0.5,
    max_errors: Optional[int] = None,
    limit: Optional[int] = None,
) -> None:
    """Generate spellcheck insights and dump them as JSONL to `output`.

    Args:
        output: path of the JSONL output file
        index_name: Elasticsearch index to query
        confidence: minimum confidence threshold for the spellchecker
        max_errors: maximum number of errors per insight (unlimited if None)
        limit: maximum number of insights to generate (unlimited if None)
    """
    from robotoff.spellcheck import Spellchecker
    from robotoff.utils import dump_jsonl, get_logger
    from robotoff.utils.es import get_es_client

    logger = get_logger()
    # Lazy %-formatting: let the logger interpolate only if the record is emitted.
    logger.info("Max errors: %s", max_errors)
    client = get_es_client()
    insights_iter = Spellchecker.load(
        client=client, confidence=confidence, index_name=index_name
    ).generate_insights(max_errors=max_errors, limit=limit)
    dump_jsonl(output, insights_iter)
def product_export():
    """Export complete French products with French ingredient text to
    Elasticsearch.

    Products flagged with unknown ingredients
    ('ingredients-unknown-score-above-0' quality tag) are skipped.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_iter = (dataset.stream().filter_by_country_tag(
        "en:france").filter_nonempty_text_field(
            "ingredients_text_fr").filter_by_state_tag("en:complete").iter())
    product_iter = (p for p in product_iter
                    if "ingredients-unknown-score-above-0" not in p.get(
                        "quality_tags", []))
    data = ((
        product["code"],
        {
            "ingredients_text_fr":
            normalize_ingredient_list(product["ingredients_text_fr"])
        },
    ) for product in product_iter)
    logger.info("Importing products")
    es_client = get_es_client()
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    # Pass the argument to the logger (lazy %-formatting) instead of eagerly
    # building the message with str.format().
    logger.info("%d rows inserted", inserted)
def predict_from_product(product: Dict) -> Optional[Dict]:
    """Run category prediction on a single product."""
    return predict(get_es_client(), product)
def match(client, query: str, lang: str):
    """Search the category index for `query` in language `lang`."""
    return client.search(
        index=settings.ELASTICSEARCH_CATEGORY_INDEX,
        doc_type=settings.ELASTICSEARCH_TYPE,
        body=generate_request(query, lang),
        _source=True,
    )


def generate_request(query: str, lang: str):
    """Build a match_phrase query body targeting the stemmed name field of
    the requested language."""
    field_name = "{}:name.stemmed".format(lang)
    return {"query": {"match_phrase": {field_name: {"query": query}}}}


def parse_args():
    """Parse command-line arguments: a query string and an optional language."""
    parser = argparse.ArgumentParser()
    parser.add_argument("query", help="query to search")
    parser.add_argument("--lang", help="language of the query", default="fr")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    es_client = get_es_client()
    results = match(es_client, args.query, args.lang)
    print(json.dumps(results["hits"], indent=4))