def stream(self) -> ProductStream:
    """Return a ``ProductStream`` over the records stored at ``self.jsonl_path``.

    The path is converted to a string; when it ends with ``.gz`` the records
    are read with ``gzip_jsonl_iter``, otherwise with ``jsonl_iter``.
    """
    path = str(self.jsonl_path)
    # Choose the reader from the file suffix, then hand its iterator to the stream.
    read_records = gzip_jsonl_iter if path.endswith(".gz") else jsonl_iter
    return ProductStream(read_records(path))
def iter_product(data_path: pathlib.Path):
    """Yield the products read from the gzipped JSONL file at ``data_path``.

    Each product is modified in place before being yielded:

    * the ``"images"`` entry is removed when present;
    * when a ``"nutriments"`` entry exists and is non-empty, every key of it
      that is not in ``NUTRIMENTS`` is removed. A falsy value (e.g. ``None``)
      is left untouched.
    """
    for item in gzip_jsonl_iter(data_path):
        item.pop("images", None)

        if "nutriments" in item:
            kept = item["nutriments"] or {}
            # Collect the keys to drop first so the dict is not resized
            # while it is being iterated.
            unwanted = [name for name in kept.keys() if name not in NUTRIMENTS]
            for name in unwanted:
                del kept[name]

        yield item
model = keras.models.load_model(str(model_path)) analysis_model = generate_analysis_model(model, "dense") generate_data_partial = functools.partial( generate_data_from_df, ingredient_to_id=ingredient_to_id, category_to_id=category_to_id, product_name_max_length=config.model_config.product_name_max_length, product_name_token_to_int=product_name_vocabulary, nlp=nlp, product_name_preprocessing_config=config.product_name_preprocessing_config, nutriment_input=config.model_config.nutriment_input, ) val_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_VAL_PATH)) category_taxonomy: Taxonomy = Taxonomy.from_json( settings.CATEGORY_TAXONOMY_PATH) val_df["deepest_categories"] = get_deepest_categories(category_taxonomy, val_df.categories_tags) X_val, y_val = generate_data_partial(val_df) y_pred_val = model.predict(X_val) val_df["predicted_deepest_categories"] = get_deepest_categories( category_taxonomy, [[category_names[i] for i, conf in enumerate(y) if conf >= 0.5] for y in y_pred_val],
def generate_X(df: pd.DataFrame, ingredient_to_id: Dict, vectorizer: CountVectorizer):
    """Build the feature matrix for the products in ``df``.

    The product names are transformed with ``vectorizer`` and converted to a
    dense array, then joined column-wise (``axis=1``) with the output of
    ``process_ingredients(df.known_ingredient_tags, ingredient_to_id)``.

    NOTE(review): ``process_ingredients`` is assumed to return a 2-D array
    with one row per product -- confirm against its definition.
    """
    name_counts = vectorizer.transform(df.product_name)
    ingredient_features = process_ingredients(df.known_ingredient_tags, ingredient_to_id)
    dense_name_counts = name_counts.toarray()
    return np.concatenate([dense_name_counts, ingredient_features], axis=1)


# --- Taxonomies and the name -> index lookups derived from them -------------
category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
ingredient_taxonomy = Taxonomy.from_json(settings.INGREDIENTS_TAXONOMY_PATH)

CATEGORY_NAMES = sorted(category_taxonomy.keys())
INGREDIENT_NAMES = sorted(ingredient_taxonomy.keys())

# Index of each name is its position in the sorted list.
CATEGORY_TO_ID = dict(zip(CATEGORY_NAMES, range(len(CATEGORY_NAMES))))
INGREDIENT_TO_ID = dict(zip(INGREDIENT_NAMES, range(len(INGREDIENT_NAMES))))

# --- Datasets ----------------------------------------------------------------
# Each file is loaded in full into a DataFrame and only then truncated to its
# first rows (1000 for train, 100 for test and validation).
train_df, test_df, val_df = (
    pd.DataFrame(gzip_jsonl_iter(dataset_path)).head(row_limit)
    for dataset_path, row_limit in (
        (settings.CATEGORY_FR_TRAIN_PATH, 1000),
        (settings.CATEGORY_FR_TEST_PATH, 100),
        (settings.CATEGORY_FR_VAL_PATH, 100),
    )
)

# --- Features ----------------------------------------------------------------
# The vocabulary is fitted on the training product names only.
count_vectorizer = CountVectorizer(min_df=5, preprocessor=preprocess_product_name)
count_vectorizer.fit(train_df.product_name)

(X_train, y_train), (X_test, y_test), (X_val, y_val) = (
    process_df(frame, CATEGORY_TO_ID, INGREDIENT_TO_ID, count_vectorizer)
    for frame in (train_df, test_df, val_df)
)

# --- Model -------------------------------------------------------------------
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)