Exemple #1
0
    def stream(self) -> ProductStream:
        """Return a ProductStream wrapping an iterator over ``self.jsonl_path``.

        A path whose string form ends with ``.gz`` is read with
        ``gzip_jsonl_iter``; any other path is read with ``jsonl_iter``.
        """
        path = str(self.jsonl_path)
        # Pick the reader from the file suffix, then call it once.
        reader = gzip_jsonl_iter if path.endswith(".gz") else jsonl_iter
        return ProductStream(reader(path))
def iter_product(data_path: pathlib.Path):
    """Yield each item read by ``gzip_jsonl_iter(data_path)`` after trimming it.

    Every yielded product is modified in place:

    * its ``"images"`` entry is removed when present;
    * when it has a non-empty ``"nutriments"`` mapping, every key of that
      mapping that is not in ``NUTRIMENTS`` is removed.

    A missing or falsy ``"nutriments"`` value is left untouched.
    """
    for record in gzip_jsonl_iter(data_path):
        record.pop("images", None)

        nutriment_values = record.get("nutriments")
        if nutriment_values:
            # Collect the keys first so the mapping is not resized while it
            # is being iterated.
            unwanted = [
                name for name in nutriment_values.keys() if name not in NUTRIMENTS
            ]
            for name in unwanted:
                del nutriment_values[name]

        yield record
# Load the Keras model stored at model_path.
model = keras.models.load_model(str(model_path))

# Build a second model from the loaded one with generate_analysis_model.
# NOTE(review): "dense" is presumably the name of the layer to inspect --
# confirm against generate_analysis_model.
analysis_model = generate_analysis_model(model, "dense")

# Pre-bind every keyword argument of generate_data_from_df, so that the
# resulting callable only needs the DataFrame to be converted.
generate_data_partial = functools.partial(
    generate_data_from_df,
    ingredient_to_id=ingredient_to_id,
    category_to_id=category_to_id,
    product_name_max_length=config.model_config.product_name_max_length,
    product_name_token_to_int=product_name_vocabulary,
    nlp=nlp,
    product_name_preprocessing_config=config.product_name_preprocessing_config,
    nutriment_input=config.model_config.nutriment_input,
)

# Build the validation DataFrame from the items yielded by gzip_jsonl_iter.
val_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_VAL_PATH))

# Category taxonomy, loaded from its JSON file.
category_taxonomy: Taxonomy = Taxonomy.from_json(
    settings.CATEGORY_TAXONOMY_PATH)

# New column: result of get_deepest_categories applied to the taxonomy and
# the categories_tags column.
val_df["deepest_categories"] = get_deepest_categories(category_taxonomy,
                                                      val_df.categories_tags)

# Convert the validation DataFrame with the pre-bound generator
# (presumably model inputs and targets -- verify against
# generate_data_from_df).
X_val, y_val = generate_data_partial(val_df)

# Model predictions for the validation inputs.
y_pred_val = model.predict(X_val)

val_df["predicted_deepest_categories"] = get_deepest_categories(
    category_taxonomy,
    [[category_names[i] for i, conf in enumerate(y) if conf >= 0.5]
     for y in y_pred_val],
Exemple #4
0
def generate_X(df: pd.DataFrame, ingredient_to_id: Dict, vectorizer: CountVectorizer):
    """Build the feature matrix for ``df``.

    The product names are transformed with ``vectorizer`` and densified, the
    ``known_ingredient_tags`` column is converted with ``process_ingredients``,
    and the two results are joined column-wise (``axis=1``).
    """
    name_counts = vectorizer.transform(df.product_name)
    ingredient_features = process_ingredients(
        df.known_ingredient_tags, ingredient_to_id
    )
    # Densify the vectorizer output so it can be concatenated with the
    # ingredient matrix.
    dense_name_counts = name_counts.toarray()
    feature_blocks = (dense_name_counts, ingredient_features)
    return np.concatenate(feature_blocks, axis=1)


# Load the category and ingredient taxonomies from their JSON files.
category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
ingredient_taxonomy = Taxonomy.from_json(settings.INGREDIENTS_TAXONOMY_PATH)

# Sorted taxonomy keys: sorting makes the name -> index assignment below
# deterministic.
CATEGORY_NAMES = sorted(category_taxonomy.keys())
INGREDIENT_NAMES = sorted(ingredient_taxonomy.keys())

# Map each name to its position in the sorted lists above.
CATEGORY_TO_ID = {name: idx for idx, name in enumerate(CATEGORY_NAMES)}
INGREDIENT_TO_ID = {name: idx for idx, name in enumerate(INGREDIENT_NAMES)}

# Keep only the first 1000 training rows and the first 100 test/validation
# rows.
# NOTE(review): each file is fully read into a DataFrame before head()
# truncates it -- confirm whether truncating the iterator first would be
# acceptable (it could change which columns the DataFrame ends up with).
train_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TRAIN_PATH)).head(1000)
test_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TEST_PATH)).head(100)
val_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_VAL_PATH)).head(100)

# Vectorizer over product names, fitted on the training rows only.
# min_df=5 drops terms that appear in fewer than 5 training product names.
count_vectorizer = CountVectorizer(min_df=5, preprocessor=preprocess_product_name)
count_vectorizer.fit(train_df.product_name)

# Convert each split with process_df, using the same id mappings and the
# same fitted vectorizer for all three.
X_train, y_train = process_df(
    train_df, CATEGORY_TO_ID, INGREDIENT_TO_ID, count_vectorizer
)
X_test, y_test = process_df(test_df, CATEGORY_TO_ID, INGREDIENT_TO_ID, count_vectorizer)
X_val, y_val = process_df(val_df, CATEGORY_TO_ID, INGREDIENT_TO_ID, count_vectorizer)

# Random forest with 10 trees, fitted on the training split.
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)