Example #1
0
def product_export():
    """Export French products with non-empty ingredient text to Elasticsearch.

    Streams the JSONL dataset, keeps only *complete* French products that
    have a French ingredient list and no unknown-ingredient quality flag,
    then bulk-indexes the normalized ingredient text.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    # Keep only complete French products with a French ingredient list.
    product_iter = (dataset.stream().filter_by_country_tag(
        'en:france').filter_nonempty_text_field(
            'ingredients_text_fr').filter_by_state_tag('en:complete').iter())
    # Drop products flagged as containing unknown ingredients.
    product_iter = (p for p in product_iter
                    if 'ingredients-unknown-score-above-0' not in p.get(
                        'quality_tags', []))

    # Lazily build (code, document) pairs for the bulk export.
    data = ((product['code'], {
        'ingredients_text_fr':
        normalize_ingredient_list(product['ingredients_text_fr'])
    }) for product in product_iter)

    logger.info("Importing products")

    es_client = get_es_client()
    # Fix: the original discarded the insert count; log it (lazy %-args)
    # for parity with the other export entry points in this module.
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("%d rows inserted", inserted)
Example #2
0
def category_export():
    """Rebuild the category index in Elasticsearch from the category taxonomy.

    Deletes all existing category documents, regenerates them from the
    taxonomy, bulk-inserts the result and logs the number of rows inserted.
    """
    logger.info("Starting category export to Elasticsearch...")
    client = get_es_client()
    category_taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)
    logger.info("Deleting existing categories...")
    # NOTE(review): delete-then-insert leaves a window with a partial index;
    # acceptable here as category lookups are async-only.
    delete_categories(client)
    logger.info("Starting export...")
    category_data = generate_category_data(category_taxonomy)
    rows_inserted = perform_export(client, category_data,
                                   settings.ELASTICSEARCH_CATEGORY_INDEX)
    # Fix: use logging's lazy %-args instead of eager "%" formatting.
    logger.info("%d rows inserted", rows_inserted)
Example #3
0
    def export_index_data(self, index: str) -> int:
        """Given the index to export data for, this function removes existing data and exports a newer version.

        .. warning: right now, we delete then recreate the index.
           This means that as this method runs,
           some request might be silently handled erroneously (with a partial index).
           This is not a problem right now, as we don't have *real-time* requests,
           but only async ones for categories.

        Returns the number of rows inserted into the index."""
        logger.info(f"Deleting existing {index} data...")
        self._delete_existing_data(index)

        index_data = self._get_data(index)

        logger.info(f"Starting {index} export to Elasticsearch...")

        rows_inserted = perform_export(self.es_client, index_data, index)

        # Fix: the original mixed an f-string with lazy %-args
        # (f"Inserted %d ... {index}", rows_inserted) — it only worked because
        # "%d" survives f-string interpolation. Use pure lazy %-formatting.
        logger.info("Inserted %d rows for index %s", rows_inserted, index)
        return rows_inserted
Example #4
0
def product_export():
    """Export French products with non-empty ingredient text to Elasticsearch.

    Streams the JSONL dataset, keeps only *complete* French products that
    have a French ingredient list and no unknown-ingredient quality flag,
    then bulk-indexes the normalized ingredient text and logs the count.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    # Keep only complete French products with a French ingredient list.
    product_iter = (dataset.stream().filter_by_country_tag(
        "en:france").filter_nonempty_text_field(
            "ingredients_text_fr").filter_by_state_tag("en:complete").iter())
    # Drop products flagged as containing unknown ingredients.
    product_iter = (p for p in product_iter
                    if "ingredients-unknown-score-above-0" not in p.get(
                        "quality_tags", []))

    # Lazily build (code, document) pairs for the bulk export.
    data = ((
        product["code"],
        {
            "ingredients_text_fr":
            normalize_ingredient_list(product["ingredients_text_fr"])
        },
    ) for product in product_iter)

    logger.info("Importing products")

    es_client = get_es_client()
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    # Fix: use logging's lazy %-args instead of eager str.format().
    logger.info("%d rows inserted", inserted)