Beispiel #1
0
def load_taxonomy(model_dir: pathlib.Path) -> Taxonomy:
    """Load the taxonomy JSON file named ``CATEGORY_TAXONOMY_NAME`` from ``model_dir``.

    Args:
        model_dir: directory that contains the taxonomy file.

    Returns:
        The ``Taxonomy`` built by ``Taxonomy.from_json`` from that file.
    """
    taxonomy_path = model_dir / CATEGORY_TAXONOMY_NAME
    return Taxonomy.from_json(taxonomy_path)
Beispiel #2
0
from typing import List

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy

# Label taxonomy, loaded once at import time and shared by the tests below.
label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)


class TestTaxonomy:
    """Checks for ``Taxonomy.is_parent_of_any`` using the label taxonomy."""

    # NOTE(review): the test names mention "child", but the method exercised
    # is ``is_parent_of_any``. Names are kept unchanged so test ids are stable.
    @pytest.mark.parametrize(
        "taxonomy,item,candidates,output",
        [
            (label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
            (label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
            (label_taxonomy, "en:fr-bio-01", [], False),
            (label_taxonomy, "en:organic", {"en:gluten-free"}, False),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:vegan"},
                False,
            ),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
                True,
            ),
        ],
    )
    def test_is_child_of_any(
        self, taxonomy: Taxonomy, item: str, candidates: List, output: bool
    ):
        """The call must return exactly ``output`` for each parametrized case."""
        result = taxonomy.is_parent_of_any(item, candidates)
        assert result is output

    def test_is_child_of_any_unknwon_item(self):
        """Passing the id ``"unknown-id"`` is expected to raise ``ValueError``."""
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", set())
        print("Product {} not found".format(barcode))
        continue

    X = generate_data(
        product=product,
        ingredient_to_id=ingredient_to_id,
        product_name_token_to_int=product_name_vocabulary,
        nlp=nlp,
        product_name_max_length=config.model_config.product_name_max_length,
        product_name_preprocessing_config=config.
        product_name_preprocessing_config,
    )

    y_pred = model.predict(X)
    y_pred_int = (y_pred > 0.5).astype(y_pred.dtype)
    taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
    y_pred_int_filled = fill_ancestors(y_pred_int,
                                       taxonomy=taxonomy,
                                       category_names=category_names)

    predicted_categories_ids = y_pred_int_filled[0].nonzero()[0]
    predicted_categories = [
        category_names[id_] for id_ in predicted_categories_ids
    ]

    predicted = []
    for predicted_category_id, predicted_category in zip(
            predicted_categories_ids, predicted_categories):
        confidence = y_pred[0, predicted_category_id]
        predicted.append((predicted_category, confidence))
def main():
    """Build the vocabularies and train one model per requested repeat.

    Steps, in order:

    1. Parse arguments, load the config and create the output directory.
    2. Load the category and ingredient taxonomies and the train/test/val
       dataframes.
    3. Keep the categories and ingredients whose count in the training
       dataframe reaches the configured minimum, and index them by their
       position in the sorted taxonomy keys.
    4. Extract the product-name vocabulary from the training product names.
    5. For every repeat: create a model, save vocabularies, config and the
       category taxonomy into the repeat directory, generate the data and
       call ``train``.
    """
    args = parse_args()
    config: Config = get_config(args)
    model_config = config.model_config

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
    ingredient_taxonomy = Taxonomy.from_json(
        settings.INGREDIENTS_TAXONOMY_PATH)

    train_df = create_dataframe("train", args.lang)
    test_df = create_dataframe("test", args.lang)
    val_df = create_dataframe("val", args.lang)

    # Frequency filtering is based on the training dataframe only.
    categories_count = count_categories(train_df)
    ingredients_count = count_ingredients(train_df)

    selected_categories = {
        name
        for name, occurrences in categories_count.items()
        if occurrences >= config.category_min_count
    }
    selected_ingredients = {
        name
        for name, occurrences in ingredients_count.items()
        if occurrences >= config.ingredient_min_count
    }
    print("{} categories selected".format(len(selected_categories)))
    print("{} ingredients selected".format(len(selected_ingredients)))

    # Sort the taxonomy keys first, then keep the selected ones, so the
    # resulting indices follow the sorted key order.
    sorted_category_keys = sorted(category_taxonomy.keys())
    category_names = [
        key for key in sorted_category_keys if key in selected_categories
    ]

    sorted_ingredient_keys = sorted(ingredient_taxonomy.keys())
    ingredient_names = [
        key for key in sorted_ingredient_keys if key in selected_ingredients
    ]

    category_to_id = dict(zip(category_names, range(len(category_names))))
    ingredient_to_id = dict(
        zip(ingredient_names, range(len(ingredient_names))))

    nlp = get_nlp(lang=config.lang)

    name_preprocessing = config.product_name_preprocessing_config
    preprocess_product_name_func = functools.partial(
        preprocess_product_name,
        lower=name_preprocessing.lower,
        strip_accent=name_preprocessing.strip_accent,
        remove_punct=name_preprocessing.remove_punct,
        remove_digit=name_preprocessing.remove_digit,
    )
    # Lazy iterator: names are preprocessed as tokenize_batch consumes them.
    preprocessed_product_names_iter = map(preprocess_product_name_func,
                                          train_df.product_name)
    train_tokens_iter = tokenize_batch(preprocessed_product_names_iter, nlp)
    product_name_to_int = extract_vocabulary(train_tokens_iter,
                                             config.product_name_min_count)

    # The model dimensions are derived from the vocabularies built above and
    # must be set before create_model(config) is called.
    model_config.ingredient_voc_size = len(ingredient_to_id)
    model_config.output_dim = len(category_to_id)
    model_config.product_name_voc_size = len(product_name_to_int)

    print("Selected vocabulary: {}".format(len(product_name_to_int)))

    generate_data_partial = functools.partial(
        generate_data_from_df,
        ingredient_to_id=ingredient_to_id,
        category_to_id=category_to_id,
        product_name_max_length=model_config.product_name_max_length,
        product_name_token_to_int=product_name_to_int,
        nlp=nlp,
        product_name_preprocessing_config=name_preprocessing,
        nutriment_input=config.model_config.nutriment_input,
    )

    # A single repeat writes straight into output_dir; several repeats each
    # get a numbered sub-directory.
    repeat_count = args.repeat
    save_dirs = (
        [output_dir]
        if repeat_count == 1
        else [output_dir / str(idx) for idx in range(repeat_count)]
    )

    for repeat_idx, save_dir in enumerate(save_dirs):
        model = create_model(config)
        save_dir.mkdir(exist_ok=True)
        config.train_config.start_datetime = str(datetime.datetime.utcnow())
        print("Starting training repeat {}".format(repeat_idx))
        save_product_name_vocabulary(product_name_to_int, save_dir)
        # Saved once before training (start time only) ...
        save_config(config, save_dir)
        copy_category_taxonomy(settings.CATEGORY_TAXONOMY_PATH, save_dir)
        save_category_vocabulary(category_to_id, save_dir)
        save_ingredient_vocabulary(ingredient_to_id, save_dir)

        train_data = generate_data_partial(train_df)
        val_data = generate_data_partial(val_df)
        test_data = generate_data_partial(test_df)
        X_train, y_train = train_data
        X_val, y_val = val_data
        X_test, y_test = test_data

        train(
            (X_train, y_train),
            (X_val, y_val),
            (X_test, y_test),
            model,
            save_dir,
            config,
            category_taxonomy,
            category_names,
        )

        # ... and once more after training, now including the end time.
        config.train_config.end_datetime = str(datetime.datetime.utcnow())
        save_config(config, save_dir)
        # Reset both timestamps before the next repeat.
        config.train_config.start_datetime = None
        config.train_config.end_datetime = None
Beispiel #5
0
from typing import List, Set

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy


# Taxonomies loaded once at import time. The tests below use label_taxonomy.
# NOTE(review): category_taxonomy is not referenced by the tests visible here.
label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)
category_taxonomy = Taxonomy.from_json(settings.TAXONOMY_CATEGORY_PATH)


class TestTaxonomy:
    """Checks for ``Taxonomy.is_parent_of_any`` using the label taxonomy."""

    # NOTE(review): the test names mention "child", but the method exercised
    # is ``is_parent_of_any``. Names are kept unchanged so test ids are stable.
    @pytest.mark.parametrize(
        "taxonomy,item,candidates,output",
        [
            (label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
            (label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
            (label_taxonomy, "en:fr-bio-01", [], False),
            (label_taxonomy, "en:organic", {"en:gluten-free"}, False),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:vegan"},
                False,
            ),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
                True,
            ),
        ],
    )
    def test_is_child_of_any(
        self, taxonomy: Taxonomy, item: str, candidates: List, output: bool
    ):
        """The call must return exactly ``output`` for each parametrized case."""
        outcome = taxonomy.is_parent_of_any(item, candidates)
        assert outcome is output

    def test_is_child_of_any_unknwon_item(self):
        """Passing the id ``"unknown-id"`` is expected to raise ``ValueError``."""
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", set())
Beispiel #6
0
    category_to_id: Dict,
    ingredient_to_id: Dict,
    vectorizer: CountVectorizer,
):
    y = generate_y(df.categories_tags, category_to_id)
    X = generate_X(df, ingredient_to_id, vectorizer)
    return X, y


def generate_X(df: pd.DataFrame, ingredient_to_id: Dict, vectorizer: CountVectorizer):
    """Build the input matrix from product names and ingredients.

    The product names in ``df.product_name`` are transformed with
    ``vectorizer`` and converted to a dense array; the ingredient matrix is
    produced by ``process_ingredients`` from ``df.known_ingredient_tags``.
    Both are concatenated along axis 1.

    Args:
        df: dataframe exposing ``product_name`` and ``known_ingredient_tags``.
        ingredient_to_id: mapping passed through to ``process_ingredients``.
        vectorizer: vectorizer whose ``transform`` is applied to the names.

    Returns:
        The concatenated array (name features first, ingredient features second).
    """
    name_features = vectorizer.transform(df.product_name)
    ingredient_features = process_ingredients(
        df.known_ingredient_tags, ingredient_to_id
    )
    feature_blocks = (name_features.toarray(), ingredient_features)
    return np.concatenate(feature_blocks, axis=1)


# Taxonomies from which the category and ingredient name lists are derived.
category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
ingredient_taxonomy = Taxonomy.from_json(settings.INGREDIENTS_TAXONOMY_PATH)

# Names in sorted order; the index mappings below follow this order.
CATEGORY_NAMES = sorted(category_taxonomy.keys())
INGREDIENT_NAMES = sorted(ingredient_taxonomy.keys())

# Each name maps to its position in the corresponding sorted list.
CATEGORY_TO_ID = dict(zip(CATEGORY_NAMES, range(len(CATEGORY_NAMES))))
INGREDIENT_TO_ID = dict(zip(INGREDIENT_NAMES, range(len(INGREDIENT_NAMES))))

# Only the first rows of each split are kept: 1000 train, 100 test, 100 val.
train_df = pd.DataFrame(
    gzip_jsonl_iter(settings.CATEGORY_FR_TRAIN_PATH)
).head(1000)
test_df = pd.DataFrame(
    gzip_jsonl_iter(settings.CATEGORY_FR_TEST_PATH)
).head(100)
val_df = pd.DataFrame(
    gzip_jsonl_iter(settings.CATEGORY_FR_VAL_PATH)
).head(100)

# Vectorizer over product names, fitted on the training subset only.
count_vectorizer = CountVectorizer(
    min_df=5,
    preprocessor=preprocess_product_name,
)
count_vectorizer.fit(train_df.product_name)