def normalize_instruction(instruction_doc, yummly_ingredients_set,
                          instruction_normalizer: RecipeNormalizer):
    """Normalize one tokenized instruction: lemmatize nouns, then join
    multi-word ingredient mentions found in ``yummly_ingredients_set``.

    :param instruction_doc: iterable of tokens (spaCy-style: each token has
        ``is_punct``, ``tag_`` and ``text``) — assumed from
        ``RecipeNormalizer.model.pipe``; TODO confirm.
    :param yummly_ingredients_set: set of ingredient word-tuples to match.
    :param instruction_normalizer: normalizer providing
        ``lemmatize_token_to_str``.
    :return: the normalized instruction as a single string.
    """
    # Build the text in a list and join once instead of quadratic `+=`.
    pieces = []
    for word in instruction_doc:
        # We want a space before every non-punctuation token, none before
        # punctuation.
        space = '' if word.is_punct else ' '
        if word.tag_ in ('NN', 'NNS', 'NNP', 'NOUN', 'NNPS'):
            pieces.append(space + instruction_normalizer.lemmatize_token_to_str(
                token=word, token_tag='NOUN'))
        else:
            pieces.append(space + word.text)

    normalized_instruction = ''.join(pieces).strip()

    normalized_instruction_tokens = re.findall(r"[\w'-]+|[.,!?; ]",
                                               normalized_instruction)
    # Greedily merge token n-grams that match known ingredients, longest
    # first; stop at 2 because length-1 matches can stay as they are.
    for n in range(8, 1, -1):
        match = True
        while match:
            normalized_instruction_tokens, match = match_ingredients(
                normalized_instruction_tokens, yummly_ingredients_set, n)

    return ''.join(normalized_instruction_tokens)
# Esempio n. 2 (scraper separator)
 def __init__(self, meat_ingredients=None):
     """Load the MLM model, the yummly ingredient set and the precomputed
     embedding-based substitute pairs.

     :param meat_ingredients: list of normalized meat ingredient names to
         replace; defaults to an empty list. (``None`` sentinel avoids the
         shared mutable default-argument pitfall of ``meat_ingredients=[]``.)
     """
     self.mlm_model = MLMModel()
     self.recipe_normalizer = RecipeNormalizer(lemmatization_types=['NOUN'])
     with open("Foodbert/data/cleaned_yummly_ingredients.json") as f:
         ingredients_yummly = json.load(f)
     # Ingredients stored as word tuples for n-gram matching.
     self.ingredients_yummly_set = {
         tuple(ing.split(' '))
         for ing in ingredients_yummly
     }
     substitutes_path = Path(
         "Foodbert/foodbert_embeddings/data/substitutes_embeddings_high_recall.json"
     )
     with substitutes_path.open() as f:
         # Set of (ingredient, substitute) tuples; attribute keeps the
         # original (misspelled) name for backward compatibility.
         self.all_predicted_subtitutes = {
             tuple(elem)
             for elem in json.load(f)
         }
     self.meat_ingredients = meat_ingredients if meat_ingredients is not None else []
# Esempio n. 3 (scraper separator)
def normalize_reviews(all_sentences):
    """Normalize a batch of review sentences with the recipe normalizer.

    Loads the yummly ingredient list, runs the sentences through the
    normalizer's spaCy pipeline, and applies ``normalize_instruction`` to
    each resulting doc.

    :param all_sentences: iterable of raw review sentences.
    :return: list of normalized sentence strings, in input order.
    """
    with open("data/cleaned_yummly_ingredients.json") as f:
        ingredients_yummly = json.load(f)
    # Word-tuple representation enables n-gram ingredient matching.
    ingredients_yummly_set = {tuple(ing.split(' ')) for ing in ingredients_yummly}

    review_normalizer = RecipeNormalizer()
    review_docs = review_normalizer.model.pipe(all_sentences,
                                               n_process=-1,
                                               batch_size=1000)

    return [
        normalize_instruction(doc,
                              ingredients_yummly_set,
                              instruction_normalizer=review_normalizer)
        for doc in tqdm(review_docs, desc='Normalizing Reviews')
    ]
# Esempio n. 4 (scraper separator)
from Foodbert.normalisation.helpers.recipe_normalizer import RecipeNormalizer

if __name__ == '__main__':
    # Normalize the raw meat list and persist one underscore-joined
    # ingredient per line.
    recipe_normalizer = RecipeNormalizer(lemmatization_types=['NOUN'])
    with open('meats.txt') as f:
        meat_ingredients = f.read().splitlines()

    normalized_meats = recipe_normalizer.normalize_ingredients(meat_ingredients,
                                                               strict=False)
    # Multi-word ingredients become a single '_'-joined token.
    normalized_meats = ['_'.join(name.split(' ')) for name in normalized_meats]

    with open('data/meats_normalized.txt', 'w') as f:
        f.writelines(element + "\n" for element in normalized_meats)
# Esempio n. 5 (scraper separator)
class ContextualizedSubstitutes:
    """Suggest context-aware, meat-free substitutes for meat ingredients.

    Combines in-context predictions from a masked-language model with a
    precomputed set of embedding-based (ingredient, substitute) pairs.
    """

    def __init__(self, meat_ingredients=None):
        """Load the MLM model, the yummly ingredient set and the
        embedding-based substitute pairs.

        :param meat_ingredients: list of normalized meat ingredient names to
            replace; defaults to an empty list. (``None`` sentinel avoids the
            mutable default-argument pitfall of ``meat_ingredients=[]``.)
        """
        self.mlm_model = MLMModel()
        self.recipe_normalizer = RecipeNormalizer(lemmatization_types=['NOUN'])
        with open("Foodbert/data/cleaned_yummly_ingredients.json") as f:
            ingredients_yummly = json.load(f)
        # Ingredients stored as word tuples for n-gram matching.
        self.ingredients_yummly_set = {
            tuple(ing.split(' '))
            for ing in ingredients_yummly
        }
        substitutes_path = Path(
            "Foodbert/foodbert_embeddings/data/substitutes_embeddings_high_recall.json"
        )
        with substitutes_path.open() as f:
            # Set of (ingredient, substitute) tuples; attribute keeps the
            # original (misspelled) name for backward compatibility.
            self.all_predicted_subtitutes = {
                tuple(elem)
                for elem in json.load(f)
            }
        self.meat_ingredients = meat_ingredients if meat_ingredients is not None else []

    def _clean_modified_string(self, original, modified):
        """Merge *modified* back into *original*, keeping only the changed
        spans that contain the token 'fish'; all other changes are reverted.

        :return: the merged, space-joined string.
        """
        a = [' '] + original.split()
        b = [' '] + modified.split()
        final_string = ""
        matching_blocks = list(
            difflib.SequenceMatcher(None, a, b).get_matching_blocks())
        all_block_tuples = []
        for idx, matching_block in enumerate(matching_blocks):
            # (original span, changed span). For a matching block both sides
            # are identical by definition; the original code sliced `b` for
            # both slots — take the a-side for the original slot instead.
            all_block_tuples.append(
                (a[matching_block.a:matching_block.a + matching_block.size],
                 b[matching_block.b:matching_block.b + matching_block.size]))
            if idx < len(matching_blocks) - 1:
                # Non-matching gap between this block and the next one.
                next_match = matching_blocks[idx + 1]
                all_block_tuples.append(
                    (a[matching_block.a + matching_block.size:next_match.a],
                     b[matching_block.b + matching_block.size:next_match.b]))

        for original_span, changed_span in all_block_tuples:
            if 'fish' in changed_span:
                final_string += ' '.join(changed_span) + ' '
            else:
                final_string += ' '.join(original_span) + ' '

        return final_string.strip()

    def meat_free(self, ingr):
        """Return False when *ingr* contains a known meat (and is not a
        'substitute' product) or equals the literal 'vegetable'; else True.

        Note: the 'vegetable' check only triggers when ``meat_ingredients``
        is non-empty (it sits inside the loop), preserved from the original.
        """
        for meat in self.meat_ingredients:
            # Parentheses make the original operator precedence explicit:
            # (meat and not substitute) OR the literal 'vegetable'.
            if (meat in ingr and 'substitute' not in ingr) or ingr == 'vegetable':
                return False
        return True

    def _generate_substitutes_in_sentence(self, sentence):
        '''
        :return: dict mapping each meat found in *sentence* to a list (one
            entry per occurrence) of {substitute: score} dicts
        '''
        # normalize sentence and ingredient_name
        sentence = self.recipe_normalizer.model.pipe([sentence],
                                                     n_process=1,
                                                     batch_size=1)
        sentence = normalize_instruction(
            instruction_doc=next(sentence),
            yummly_ingredients_set=self.ingredients_yummly_set,
            instruction_normalizer=self.recipe_normalizer)

        # find all meats mentioned in the normalized sentence
        meats = []
        meats_with_substitutes = {}
        sentence_to_split = self.recipe_normalizer.model.pipe([sentence],
                                                              n_process=1,
                                                              batch_size=1)
        for word in next(sentence_to_split):
            for meat in self.meat_ingredients:
                if meat == word.norm_ and 'substitute' not in word.norm_:
                    meats.append(word.norm_)

        # find replacement for all meats
        for ingredient_to_replace in set(meats):
            ingredient_to_replace = self.recipe_normalizer.normalize_ingredients(
                [ingredient_to_replace], strict=False)[0]
            # Multi-word ingredients are '_'-joined into a single token.
            ingredient_to_replace = '_'.join(ingredient_to_replace.split(' '))

            ordered_substitutes = self.mlm_model.predict_substitutes(
                sentence=sentence,
                ingredient_name=ingredient_to_replace,
                with_masking=True)

            # Substitutes known from the embedding pairs for this meat.
            ingredient_substitutes = {
                '_'.join(elem[1].split())
                for elem in self.all_predicted_subtitutes
                if elem[0] == ' '.join(ingredient_to_replace.split('_'))
            }

            all_ordered_substitutes = []
            for occurence_substitutes in ordered_substitutes:
                # Keep MLM predictions that are also embedding substitutes
                # and meat-free.
                occurence_substitutes_filtered = {
                    ingr: score
                    for ingr, score in zip(occurence_substitutes[0],
                                           occurence_substitutes[1])
                    if ingr in ingredient_substitutes and self.meat_free(ingr)
                }
                if len(occurence_substitutes_filtered) == 0:
                    # Fall back to all meat-free MLM predictions.
                    occurence_substitutes_filtered = {
                        ingr: score
                        for ingr, score in zip(occurence_substitutes[0],
                                               occurence_substitutes[1])
                        if self.meat_free(ingr)
                    }
                all_ordered_substitutes.append(occurence_substitutes_filtered)

            meats_with_substitutes[
                ingredient_to_replace] = all_ordered_substitutes

        return meats_with_substitutes

    def generate_substitute_in_text(self, text):
        """Aggregate per-sentence substitute predictions over *text*.

        :return: dict mapping each meat to its substitutes sorted by joint
            weighted score, best first.
        """
        with open('Foodbert/foodbert/data/ingredient_counts.json') as f:
            ingredients_counts = dict(json.load(f))
        sentences = split_reviews_to_sentences([text])
        predictions_per_meat = defaultdict(list)
        for sentence in sentences:
            substitutes_per_meat = self._generate_substitutes_in_sentence(
                sentence)
            for key in substitutes_per_meat:
                for prediction in substitutes_per_meat[key]:
                    predictions_per_meat[key].append(prediction)

        joined_predictions_per_meat = {}
        for key1 in predictions_per_meat:
            joined_substitutes = defaultdict(float)
            for prediction_dict in predictions_per_meat[key1]:
                for key2, value in prediction_dict.items():
                    # BUGFIX: weight by the meat currently being aggregated
                    # (key1); the original referenced the stale loop variable
                    # `key` left over from the sentence loop above.
                    joined_substitutes[key2] += value * calculate_weight(
                        key1, ingredients_counts)
            joined_substitutes = sorted(joined_substitutes,
                                        key=joined_substitutes.get,
                                        reverse=True)
            joined_predictions_per_meat[key1] = joined_substitutes

        return joined_predictions_per_meat

    def generate_substitutes_in_recipe(self,
                                       recipe_text,
                                       prefix="",
                                       suffix=""):
        """Replace every detected meat in *recipe_text* with its best
        substitute, optionally wrapped in *prefix*/*suffix*.

        :return: (rewritten recipe text, {meat: chosen substitute}).
        """
        substitutes_per_meat = self.generate_substitute_in_text(recipe_text)

        # normalize sentence and ingredient_name
        recipe_text = self.recipe_normalizer.model.pipe([recipe_text],
                                                        n_process=-1,
                                                        batch_size=1)
        recipe_text = normalize_instruction(
            instruction_doc=next(recipe_text),
            yummly_ingredients_set=self.ingredients_yummly_set,
            instruction_normalizer=self.recipe_normalizer)

        substitute_dict = {}
        for ingredient_to_replace, substitutes in substitutes_per_meat.items():
            substitute = substitutes[0]
            ingredient_to_replace = self.recipe_normalizer.normalize_ingredients(
                [ingredient_to_replace], strict=False)[0]
            ingredient_to_replace = '_'.join(ingredient_to_replace.split(' '))

            # Pad punctuation with spaces so the space-delimited whole-word
            # replacement below cannot match inside another token.
            recipe_text = recipe_text.replace('!', ' !').replace(
                '?', ' ?').replace('.', ' .').replace(':',
                                                      ' :').replace(',', ' ,')
            recipe_text = ' ' + recipe_text + ' '
            recipe_text = recipe_text.replace(
                f' {ingredient_to_replace} ',
                f' {prefix + substitute + suffix} ')
            # Undo the punctuation padding.
            recipe_text = recipe_text.replace(' !', '!').replace(
                ' ?', '?').replace(' .', '.').replace(' :',
                                                      ':').replace(' ,', ',')
            ingredient_to_replace = ingredient_to_replace.replace('_', ' ')
            substitute = substitute.replace('_', ' ')
            substitute_dict[ingredient_to_replace] = substitute

        recipe_text = recipe_text.replace('_', ' ')
        return recipe_text.strip(), substitute_dict
# Esempio n. 6 (scraper separator)
 def __init__(self):
     """Normalize all Recipe1M instruction texts against the yummly
     ingredient set and dump the updated recipes to ``export_path``.

     NOTE(review): the original fragment's indentation was inconsistent
     (body dedented below the method level — a syntax error); reformatted
     to a single consistent level. ``recipe1m_json_path`` and
     ``export_path`` are not defined in this fragment — presumably
     module-level ``Path`` constants; confirm against the full file.
     """
     with open('data/meats_normalized.txt') as f:
         self.meat_ingredients = f.read().splitlines()
     self.cs = ContextualizedSubstitutes(
         meat_ingredients=self.meat_ingredients)
     self.instruction_normalizer = RecipeNormalizer()
     with open("data/cleaned_yummly_ingredients.json") as f:
         ingredients_yummly = json.load(f)
     ingredients_yummly_set = {
         tuple(ing.split(' '))
         for ing in ingredients_yummly
     }

     with recipe1m_json_path.open() as f:
         recipes = json.load(f)

     # Flatten all instruction texts in recipe order for batched processing.
     instruction_lists = [recipe['instructions'] for recipe in recipes]
     instructions = []
     for instruction_list in instruction_lists:
         for instruction in instruction_list:
             instructions.append(instruction['text'])
     # A second (local) normalizer, as in the original; kept to preserve
     # behavior even though self.instruction_normalizer already exists.
     instruction_normalizer = RecipeNormalizer()
     normalized_instructions = instruction_normalizer.model.pipe(
         instructions, n_process=-1, batch_size=1000)

     # Consume the docs generator in the same order the texts were appended.
     for recipe in tqdm(recipes, total=len(recipes)):
         for instruction_dict in recipe['instructions']:
             normalized_instruction = normalize_instruction(
                 next(normalized_instructions),
                 ingredients_yummly_set,
                 instruction_normalizer=instruction_normalizer)
             instruction_dict['text'] = normalized_instruction

     with export_path.open('w') as f:
         json.dump(recipes, f)