def product_export():
    """Export the French ingredient texts of selected products to Elasticsearch.

    Products are streamed from the JSONL dataset located at
    ``settings.JSONL_DATASET_PATH`` and narrowed down with the dataset
    stream filters (country tag ``en:france``, non-empty
    ``ingredients_text_fr`` field, state tag ``en:complete``). Products whose
    ``quality_tags`` contain ``ingredients-unknown-score-above-0`` are
    skipped. Each remaining product is exported as a
    ``(code, {"ingredients_text_fr": <normalized text>})`` pair to
    ``settings.ELASTICSEARCH_PRODUCT_INDEX``.
    """

    def _has_no_unknown_ingredient(item):
        # Keep only the products that are not flagged with the quality tag.
        tags = item.get('quality_tags', [])
        return 'ingredients-unknown-score-above-0' not in tags

    def _to_document(item):
        # Build the (identifier, document body) pair handed to the export.
        normalized = normalize_ingredient_list(item['ingredients_text_fr'])
        return item['code'], {'ingredients_text_fr': normalized}

    stream = ProductDataset(settings.JSONL_DATASET_PATH).stream()
    stream = stream.filter_by_country_tag('en:france')
    stream = stream.filter_nonempty_text_field('ingredients_text_fr')
    stream = stream.filter_by_state_tag('en:complete')

    # Both steps are lazy: nothing is consumed until the export iterates.
    selected = filter(_has_no_unknown_ingredient, stream.iter())
    documents = map(_to_document, selected)

    logger.info("Importing products")
    perform_export(get_es_client(), documents,
                   settings.ELASTICSEARCH_PRODUCT_INDEX)
def category_export():
    """Re-export the category taxonomy to the Elasticsearch category index.

    The existing categories are removed with ``delete_categories`` before the
    data generated from the category taxonomy is exported to
    ``settings.ELASTICSEARCH_CATEGORY_INDEX``. The number of rows reported by
    ``perform_export`` is logged once the export is done.
    """
    logger.info("Starting category export to Elasticsearch...")
    client = get_es_client()
    category_taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)

    logger.info("Deleting existing categories...")
    delete_categories(client)

    logger.info("Starting export...")
    category_data = generate_category_data(category_taxonomy)
    rows_inserted = perform_export(client, category_data,
                                   settings.ELASTICSEARCH_CATEGORY_INDEX)
    # Pass the count as a logging argument (instead of pre-formatting the
    # message) so that interpolation is deferred to the logging framework.
    logger.info("%d rows inserted", rows_inserted)
def export_index_data(self, index: str) -> int:
    """Given the index to export data for, this function removes existing
    data and exports a newer version.

    .. warning::
       right now, we delete then recreate the index. This means that as
       this method runs, some request might be silently handled erroneously
       (with a partial index). This is not a problem right now, as we don't
       have *real-time* requests, but only async ones for categories.

    Returns the number of rows inserted into the index (the value returned
    by ``perform_export``).
    """
    # The index name is always passed as a logging argument rather than
    # interpolated into the format string: mixing an f-string with %-style
    # arguments would break the message if ``index`` contained a '%'.
    logger.info("Deleting existing %s data...", index)
    self._delete_existing_data(index)

    index_data = self._get_data(index)

    logger.info("Starting %s export to Elasticsearch...", index)
    rows_inserted = perform_export(self.es_client, index_data, index)

    logger.info("Inserted %d rows for index %s", rows_inserted, index)
    return rows_inserted
def product_export():
    """Export the French ingredient texts of selected products to Elasticsearch.

    Products are streamed from the JSONL dataset located at
    ``settings.JSONL_DATASET_PATH`` and narrowed down with the dataset
    stream filters (country tag ``en:france``, non-empty
    ``ingredients_text_fr`` field, state tag ``en:complete``). Products whose
    ``quality_tags`` contain ``ingredients-unknown-score-above-0`` are
    skipped. Each remaining product is exported as a
    ``(code, {"ingredients_text_fr": <normalized text>})`` pair to
    ``settings.ELASTICSEARCH_PRODUCT_INDEX``, and the number of rows reported
    by ``perform_export`` is logged.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
        .iter()
    )
    # Skip the products flagged with the unknown-ingredients quality tag.
    product_iter = (
        p
        for p in product_iter
        if "ingredients-unknown-score-above-0" not in p.get("quality_tags", [])
    )
    # Lazily build the (identifier, document body) pairs handed to the export.
    data = (
        (
            product["code"],
            {
                "ingredients_text_fr": normalize_ingredient_list(
                    product["ingredients_text_fr"]
                )
            },
        )
        for product in product_iter
    )

    logger.info("Importing products")
    es_client = get_es_client()
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    # Pass the count as a logging argument (instead of pre-formatting the
    # message) so that interpolation is deferred to the logging framework.
    logger.info("%d rows inserted", inserted)