Ejemplo n.º 1
0
class RecipeParser:
    """Parse scraped recipe text files and store them in the recipe database.

    Each input file is a sequence of records.  As consumed by
    ParseRecipeFile, a record is laid out as: URL line, image URL line,
    title line, one or more description lines, a marker line containing
    "preptime", the prep time line, the cook time line, the servings line,
    zero or more ingredient lines, and finally a line containing
    END_OF_RECIPE.
    """

    recipeFiles = [
        "recepie_project/ttds-project-bbc/content.txt",
        "recepie_project_all_recipes/ttds-project/contents.txt",
        "recepie_project_epicurious/ttds-project/contents.txt",
        "recepie_project_food/ttds-project/contents.txt",
        "recipe_myrecipes.com_project/ttds-project/myrecipes_content.txt"
    ]
    # Alternative input sets, kept for reference:
    # recipeFiles      = ["test_database/content_all_recipes.txt",
    #                     "test_database/content_bbc.txt",
    #                     "test_database/content_epicurious.txt",
    #                     "test_database/content_food.txt",
    #                     "test_database/content_my_recipes.txt"]
    # recipeFiles      = ["continue_to_add_from_here/food_contents.txt",
    #                     "continue_to_add_from_here/myrecipes_content.txt"]

    # Marker that terminates one recipe record in the input files.
    END_OF_RECIPE = "*****eol*****"
    # Sentinel returned by clean_time() when a time string cannot be parsed.
    SKIP_RECIPE = "skip this recipe"

    recipeFileHandle = None
    database = None
    ps = None
    stop_words = None
    recipe_classes = None
    common_ing = None
    common_ing_f = None
    recipe_id = 0

    def __init__(self, recipe_id=0):
        """Open the "recipes" database and load the word lists.

        Args:
            recipe_id: ID of the last recipe already stored.  A non-zero
                value opens the database with ``append=True``; numbering
                continues from ``recipe_id + 1``.
        """
        if recipe_id != 0:
            self.database = Database("recipes", append=True)
        else:
            self.database = Database("recipes")
        self.recipe_id = recipe_id
        self.ps = PorterStemmer()
        self.stop_words = self.GetStopWords()  # descriptive words
        self.recipe_classes = self.GetRecipeClasses()  # labels
        # common ingredients that can be embedded, e.g. "anise-flavored"
        self.common_ing = self.GetCommonIng()
        # words that need to be matched fully, e.g. "tea"
        self.common_ing_f = self.GetCommonIngFullWords()

    @staticmethod
    def _read_lines(path):
        """Return the non-empty lines of a UTF-8 text file, newline stripped.

        Blank lines are dropped because an empty entry would match every
        string in the substring tests done by the callers of these lists.
        """
        with open(path, 'r', encoding="utf-8") as handle:
            entries = [line.rstrip('\n') for line in handle]
        return [entry for entry in entries if entry]

    def GetStopWords(self):
        """Return the entries of recipe_stopwords.txt as a list."""
        return self._read_lines("recipe_stopwords.txt")

    def GetRecipeClasses(self):
        """Return the entries of classes.txt as a list."""
        return self._read_lines("classes.txt")

    def GetCommonIng(self):
        """Return the entries of known_ing.txt as a list."""
        return self._read_lines("known_ing.txt")

    def GetCommonIngFullWords(self):
        """Return the entries of known_ing_full.txt as a list."""
        return self._read_lines("known_ing_full.txt")

    def cleaning_ing_list(self, ingredients):
        """Normalise raw ingredient strings into de-duplicated index terms.

        For every ingredient string:

        * words listed in ``self.common_ing_f`` are matched as whole words;
        * entries of ``self.common_ing`` are matched as substrings of the
          remaining lower-cased text;
        * if anything matched, each stemmed match plus their space-joined
          combination is emitted;
        * otherwise bracketed text and everything after the first comma is
          dropped, stop words are removed and the remaining words are
          stemmed and joined into a single term.

        Args:
            ingredients: iterable of raw ingredient strings.

        Returns:
            List of unique terms in first-seen order.
        """
        word_pattern = re.compile(r"\b[a-zA-Z-]+\b")
        bracket_pattern = re.compile(r"\s?[\(\[].*?[\)\]]")
        stop_words = set(self.stop_words)
        full_words = set(self.common_ing_f)

        common_ing_list = []
        list_of_ing = []

        for raw_ing in ingredients:
            # Whole-word matches.  A new list is built instead of removing
            # from the list being iterated, which used to skip the word
            # following every match.
            # NOTE(review): this match is case-sensitive (done before
            # lower-casing) - confirm known_ing_full.txt casing.
            short_words_present = []
            remaining_words = []
            for word in word_pattern.findall(raw_ing):
                if word in full_words:
                    short_words_present.append(self.ps.stem(word))
                else:
                    remaining_words.append(word)

            # Substring matches; each hit is blanked out so that later
            # entries cannot match the same text again.
            searchable = ' '.join(remaining_words).lower()
            common_ing_present = []
            for common_ing in self.common_ing:
                if common_ing in searchable:
                    searchable = searchable.replace(common_ing, ' ')
                    common_ing_present.append(self.ps.stem(common_ing))

            known = short_words_present + common_ing_present
            if known:
                common_ing_list.extend(known)
                common_ing_list.append(' '.join(known))
                continue

            # Unknown ingredient: strip brackets and the trailing
            # description on the RAW text, before the word regex discards
            # the punctuation these steps rely on.
            text = bracket_pattern.sub("", raw_ing)
            text = text.split(',', 1)[0]
            words = [word.lower() for word in word_pattern.findall(text)]
            stems = [self.ps.stem(word) for word in words
                     if word not in stop_words]
            if stems:
                list_of_ing.append(' '.join(stems).lower())

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(common_ing_list + list_of_ing))

    def recipe_class(self, title, discription_text):
        """Return the class labels found in a recipe's title and description.

        Punctuation is replaced by spaces and the text is lower-cased; a
        label applies when it occurs as a substring.  'easter' is dropped
        whenever 'middle eastern' also matched, since it is a substring of
        that label.

        Args:
            title: recipe title.
            discription_text: recipe description.

        Returns:
            Matching labels, in the order of ``self.recipe_classes``.
        """
        description = re.sub(r'[^\w\s]', ' ',
                             title + " " + discription_text).lower()

        labels = [label for label in self.recipe_classes
                  if label in description]

        if 'middle eastern' in labels:
            labels = [label for label in labels if label != 'easter']

        return labels

    def clean_time(self, string, url):
        """Convert a free-text duration into minutes.

        Args:
            string: raw time text, e.g. "1 hr 30 mins", "2 days", "30".
            url: recipe URL, only used in the error message.

        Returns:
            Total minutes as an int, or ``SKIP_RECIPE``
            ("skip this recipe") when the text cannot be interpreted.
        """
        # Unit matches are taken from the full text: the first number
        # directly followed by d / h / m (case-insensitive).
        day_match = re.search(r"([0-9]+)\s?d+", string, re.I)
        hour_match = re.search(r"([0-9]+)\s?h+", string, re.I)
        minute_match = re.search(r"([0-9]+)\s?m+", string, re.I)

        if day_match or hour_match or minute_match:
            days = int(day_match.group(1)) if day_match else 0
            hours = int(hour_match.group(1)) if hour_match else 0
            minutes = int(minute_match.group(1)) if minute_match else 0
            return days * 1440 + hours * 60 + minutes

        # No unit found: reduce ranges to a single value.
        range_match = re.search(r"[0-9]+\s[t][o]\s+[0-9]", string,
                                re.I)  # e.g. "1 to 2"
        text = string
        if "-" in text:
            text = text.split('-', 1)[0]  # "30-40" -> lower bound
        if "to" in text:
            # numeric range -> upper bound, otherwise the text before "to"
            text = text.split("to")[1 if range_match else 0]

        # Strip the trailing newline / spaces so that a bare number is
        # recognised by isdigit().
        text = text.strip()
        lowered = text.lower()

        if "overnight" in lowered:
            return 8 * 60
        if "no cooking required" in lowered:
            return 0
        if text.isdigit():
            return int(text)
        if "0s" in lowered:
            return 10 * 60  # Error in parsing recipe given large prep time
        if "none" in lowered:
            return 0
        if "=" in lowered:
            return 10 * 60  # Error in parsing recipe given large prep time

        print("1. TIME INPUT ERROR: ", text.encode("utf-8"), " URL: ", url)
        return self.SKIP_RECIPE

    @staticmethod
    def _split_ingredient_line(line):
        """Split one ingredient line into the ingredient strings it names.

        Handles "and", "and/or", "plus extra", "plus", "or" and "or ... for"
        phrasing.  When a keyword is present as a word but the expected
        " keyword " separator is not, the whole line is returned unchanged
        and a diagnostic is printed.
        """
        words = line.split()

        def split_pair(separator, error_label):
            head, found, tail = line.partition(separator)
            if found:
                return [head, tail]
            print(error_label, line.encode("utf-8"))
            return [line]

        if "and" in words:
            return split_pair(" and ", "1. ERROR in AND: ")

        if "and/or" in words:
            return split_pair(" and/or ", "error in AND/OR: ")

        if "plus extra" in line:
            # the extra amount is the same ingredient, keep only the first
            return [line.split("plus extra", 1)[0]]

        if "plus" in words:
            return split_pair(" plus ", "plus error line: ")

        # due to the similarity in spelling, "for" and "or" are a special case
        if "or" in words:
            if "for" not in words:
                return split_pair(" or ", "error in OR: ")

            pos_for = line.find(" for ")
            pos_or = line.find(" or ")

            if pos_for > pos_or:
                # "<a> or <b> for <purpose>": keep a and b
                primary_ing, found, alternative = line.partition(" or ")
                if found:
                    return [primary_ing, alternative.split("for ", 1)[0]]
                print("error in OR and FOR: ", line.encode("utf-8"))
                return [line]

            # "<a> for <purpose> ...": drop why the ingredient is needed
            primary_ing, found, _ = line.partition(" for ")
            if found:
                return [primary_ing]
            print("error in FOR: ", line.encode("utf-8"))
            return [line]

        return [line]

    def _store_recipe(self, url, img_url, title, description, prep_time,
                      cook_time, servings, ingredients):
        """Clean one parsed recipe and write it to the database."""
        clean_ingredients = self.cleaning_ing_list(ingredients)
        # class labels are indexed alongside the ingredients
        clean_ingredients = clean_ingredients + self.recipe_class(
            title, description)

        for ingredient in clean_ingredients:
            self.database.AddToIngredientIndexTable(ingredient,
                                                    self.recipe_id)

        self.database.AddToRecipeInfoTable(self.recipe_id, url, img_url,
                                           title, description, prep_time,
                                           cook_time, servings,
                                           len(clean_ingredients))

    def ParseRecipeFile(self):
        """Parse every record of ``self.recipeFileHandle`` into the database.

        ``self.recipe_id`` is incremented for every record whose URL line is
        read.  Records whose prep or cook time cannot be parsed, and records
        that end before the servings line, are skipped.
        """
        line_num = 1
        ingredients = []
        description = ""
        url = img_url = title = ""
        prep_time = cook_time = self.SKIP_RECIPE
        servings = 1

        for line in self.recipeFileHandle:
            # The end marker is checked first so that a truncated record
            # cannot desynchronise the records that follow it.
            if self.END_OF_RECIPE in line:
                if line_num > 7:
                    if self.SKIP_RECIPE not in (prep_time, cook_time):
                        self._store_recipe(url, img_url, title, description,
                                           prep_time, cook_time, servings,
                                           ingredients)
                elif line_num > 1:
                    print("Incomplete recipe record skipped, URL: ",
                          url.encode("utf-8"))

                line_num = 1
                ingredients = []
                description = ""
                prep_time = cook_time = self.SKIP_RECIPE
                servings = 1
                continue

            if line_num == 1:
                url = line
                self.recipe_id += 1
                line_num += 1
                if self.recipe_id % 100 == 0:
                    print("Processed %d recipes." % self.recipe_id)

            elif line_num == 2:
                img_url = line
                line_num += 1

            elif line_num == 3:
                title = line
                line_num += 1

            elif line_num == 4:
                # description lines run until the "preptime" marker line
                if "preptime" in line:
                    line_num += 1
                else:
                    description = description + " " + line

            elif line_num == 5:
                prep_time = self.clean_time(line, url)
                line_num += 1

            elif line_num == 6:
                cook_time = self.clean_time(line, url)
                line_num += 1

            elif line_num == 7:
                first_number = re.search(r'\d+', line)
                # no digits, e.g. "Makes one jar" -> one serving
                servings = int(first_number.group()) if first_number else 1
                line_num += 1

            else:
                ingredients.extend(self._split_ingredient_line(line))

    def ParseRecipeFiles(self):
        """Parse every file listed in ``self.recipeFiles``."""
        for recipeFile in self.recipeFiles:
            # the context manager closes the file even if parsing raises
            with open(recipeFile, 'r', encoding="utf-8") as handle:
                self.recipeFileHandle = handle
                self.ParseRecipeFile()
Ejemplo n.º 2
0
from Database import *
from VariableByteEncoder import *

# Demo script: index a handful of (ingredient, recipe ID) pairs in the
# "recipes" database, then print the recipe IDs stored for each ingredient.
database = Database("recipes")

print("Adding ingredients and recipe IDs to database.")
index_entries = (
    ("olive oil", 100),
    ("chicken", 100),
    ("spinach", 100),
    ("fish", 200),
    ("chicken", 200),
    ("chicken", 300),
    ("spinach", 300),
    ("corn oil", 400),
    ("corn oil", 200),
    ("oil", 500),
    ("oil", 200),
)
for ingredient_name, recipe_number in index_entries:
    database.AddToIngredientIndexTable(ingredient_name, recipe_number)

print("Retrieving recipe IDs from database.")
lookup_order = ("olive oil", "chicken", "spinach", "fish", "corn oil", "oil")
for ingredient_name in lookup_order:
    recipeIds = database.GetRecipeIds(ingredient_name)
    print(recipeIds)