class RecipeParser:
    """Parse scraped recipe text files and store them in the "recipes" database.

    Each input file is read as a sequence of records. As consumed by
    ParseRecipeFile(), a record consists of:

      1. the recipe URL
      2. the image URL
      3. the title
      4. zero or more description lines, ended by a line containing "preptime"
      5. the preparation time
      6. the cooking time
      7. the servings
      8. ingredient lines, ended by a line containing "*****eol*****"

    For every record the cleaned ingredients (plus any matching class labels)
    are passed to Database.AddToIngredientIndexTable and the recipe metadata
    to Database.AddToRecipeInfoTable.
    """

    # Input files processed, in order, by ParseRecipeFiles().
    recipeFiles = [
        "recepie_project/ttds-project-bbc/content.txt",
        "recepie_project_all_recipes/ttds-project/contents.txt",
        "recepie_project_epicurious/ttds-project/contents.txt",
        "recepie_project_food/ttds-project/contents.txt",
        "recipe_myrecipes.com_project/ttds-project/myrecipes_content.txt"
    ]
    # Alternative input sets:
    # recipeFiles = ["test_database/content_all_recipes.txt",
    #                "test_database/content_bbc.txt",
    #                "test_database/content_epicurious.txt",
    #                "test_database/content_food.txt",
    #                "test_database/content_my_recipes.txt"]
    # recipeFiles = ["continue_to_add_from_here/food_contents.txt",
    #                "continue_to_add_from_here/myrecipes_content.txt"]

    # Value returned by clean_time() when a time cannot be parsed;
    # ParseRecipeFile() drops any recipe whose prep or cook time equals it.
    SKIP_RECIPE = "skip this recipe"

    recipeFileHandle = None
    database = None
    ps = None
    stop_words = None
    recipe_classes = None
    common_ing = None
    common_ing_f = None
    recipe_id = 0

    def __init__(self, recipe_id=0):
        """Open the database and load the word lists.

        Args:
            recipe_id: id of the last recipe already stored. A non-zero value
                opens the database with ``append=True``; numbering of newly
                parsed recipes continues from ``recipe_id + 1``.
        """
        if recipe_id != 0:
            self.database = Database("recipes", append=True)
        else:
            self.database = Database("recipes")
        self.recipe_id = recipe_id
        self.ps = PorterStemmer()
        self.stop_words = self.GetStopWords()  # descriptive words
        self.recipe_classes = self.GetRecipeClasses()  # labels
        # Common ingredients that can be embedded, e.g. "anise-flavored".
        self.common_ing = self.GetCommonIng()
        # Words that need to be matched fully, e.g. "tea".
        self.common_ing_f = self.GetCommonIngFullWords()

    @staticmethod
    def _read_lines(path):
        """Return the lines of the UTF-8 file *path* without trailing newlines."""
        # The context manager closes the file even if decoding fails.
        with open(path, 'r', encoding="utf-8") as handle:
            return [entry.rstrip('\n') for entry in handle]

    def GetStopWords(self):
        """Return the entries of recipe_stopwords.txt, one per line."""
        return self._read_lines("recipe_stopwords.txt")

    def GetRecipeClasses(self):
        """Return the entries of classes.txt, one per line."""
        return self._read_lines("classes.txt")

    def GetCommonIng(self):
        """Return the entries of known_ing.txt, one per line."""
        return self._read_lines("known_ing.txt")

    def GetCommonIngFullWords(self):
        """Return the entries of known_ing_full.txt, one per line."""
        return self._read_lines("known_ing_full.txt")

    def cleaning_ing_list(self, ingredients):
        """Reduce raw ingredient strings to a de-duplicated list of index terms.

        For each ingredient string, only runs of letters and hyphens are
        kept. Then:

          * words listed in ``self.common_ing_f`` are taken as whole-word
            matches (case-sensitive) and stemmed;
          * entries of ``self.common_ing`` found as substrings of the
            remaining lower-cased text are stemmed;
          * if either kind was found, each stem and the space-joined
            combination of all of them are emitted;
          * otherwise the remaining words, minus ``self.stop_words``, are
            stemmed and emitted as one space-joined term.

        Args:
            ingredients: iterable of raw ingredient strings.

        Returns:
            list of unique terms; the order is unspecified (built from a set).
        """
        word_pattern = re.compile(r"\b[a-zA-Z-]+\b")  # hoisted out of the loop
        full_word_ings = set(self.common_ing_f)
        stop_words = set(self.stop_words)

        list_of_ing = []
        common_ing_list = []
        for ing in ingredients:
            # Split into full-word matches and everything else. Two lists are
            # built instead of removing from the list being iterated, which
            # would skip the word following every match.
            short_words_present = []
            remaining_words = []
            for w in word_pattern.findall(ing):
                if w in full_word_ings:
                    short_words_present.append(self.ps.stem(w))
                else:
                    remaining_words.append(w)
            ing = ' '.join(remaining_words).lower()

            # Look for commonly used ingredients; each hit is blanked out so
            # later entries cannot match inside an earlier one.
            common_ing_present = []
            ing_copy = ing
            for common_ing in self.common_ing:
                if common_ing in ing_copy:
                    ing_copy = ing_copy.replace(common_ing, ' ')
                    common_ing_present.append(self.ps.stem(common_ing))

            if common_ing_present or short_words_present:
                found = short_words_present + common_ing_present
                common_ing_list.extend(found)
                common_ing_list.append(' '.join(found))
                continue

            # NOTE(review): `ing` holds only letters, hyphens and spaces at
            # this point, so the bracket and comma clean-up below cannot
            # match anything; kept so behaviour stays as before.
            ing = re.sub(r"\s?[\(\[].*?[\)\]]", "", ing)
            if "," in ing:
                ing = ing.split(',', 1)[0]

            # Drop stop words, then stem what is left.
            clean_ing = [
                self.ps.stem(w) for w in ing.split() if w not in stop_words
            ]
            if clean_ing:
                list_of_ing.append(' '.join(clean_ing).lower())

        return list(set(common_ing_list + list_of_ing))

    def recipe_class(self, title, discription_text):
        """Return every entry of ``self.recipe_classes`` found in the text.

        Title and description are joined, punctuation is replaced by spaces
        and the result is lower-cased; a label matches when it occurs as a
        substring. The label 'easter' is skipped when its only occurrences
        are inside 'middle eastern'.

        Args:
            title: recipe title.
            discription_text: recipe description.

        Returns:
            list of matching labels, in ``self.recipe_classes`` order.
        """
        discript = title + " " + discription_text
        # Removes punctuation, lower case.
        discription = re.sub(r'[^\w\s]', ' ', discript).lower()
        # Text with "middle eastern" blanked out, used only for 'easter' so
        # the check does not depend on the order of the class list.
        without_middle_eastern = discription.replace('middle eastern', ' ')

        labels = []
        for label in self.recipe_classes:
            if label not in discription:
                continue
            if label == 'easter' and 'easter' not in without_middle_eastern:
                continue
            labels.append(label)
        return labels

    def clean_time(self, string, url):
        """Convert a free-text duration into minutes.

        Numbers followed by d/h/m (case-insensitive, optional whitespace) are
        read as days, hours and minutes; the first match of each unit is
        used. When no unit is present, a few known phrases and bare numbers
        are recognised.

        Args:
            string: raw time text, e.g. "1 hr 30 mins".
            url: recipe URL, used only in the error message.

        Returns:
            total minutes as an int, or ``RecipeParser.SKIP_RECIPE``
            ("skip this recipe") when the text cannot be interpreted.
        """
        d_match = re.findall(r"([0-9]+\s?[d]+)", string, re.I)
        h_match = re.findall(r"([0-9]+\s?[h]+)", string, re.I)
        m_match = re.findall(r"([0-9]+\s?[m]+)", string, re.I)
        # e.g. "1 to 2 hours"
        to_match = re.findall(r"([0-9]+\s[t][o]\s+[0-9])", string, re.I)

        # The unit matches above were taken from the full text; the trimming
        # below only affects the unit-less fallback further down.
        if "-" in string:
            string = string.split('-', 1)[0]
        if "to" in string:
            if to_match:
                string = string.split("to")[1]
            else:
                string = string.split("to")[0]

        def first_number(matches):
            # Each match starts with its digits, e.g. "30 m" -> 30.
            return int(re.findall(r'\d+', matches[0])[0])

        days = first_number(d_match) if d_match else 0
        hours = first_number(h_match) if h_match else 0
        minutes = first_number(m_match) if m_match else 0

        if not d_match and not h_match and not m_match:
            lowered = string.lower()
            # Lines read from a file keep their newline, so strip before
            # testing for a bare number.
            stripped = string.strip()
            if "overnight" in lowered:
                hours = 8
            elif "no cooking required" in lowered:
                hours = 0
            elif stripped.isdigit():
                minutes = int(stripped)
            elif "0s" in lowered:
                hours = 10  # Error in parsing recipe given large prep time
            elif "none" in lowered:
                hours = 0
            elif "=" in lowered:
                hours = 10  # Error in parsing recipe given large prep time
            else:
                print("1. TIME INPUT ERROR: ", string.encode("utf-8"),
                      " URL: ", url)
                return self.SKIP_RECIPE

        return days * 1440 + hours * 60 + minutes

    @staticmethod
    def _split_ingredient_line(line):
        """Split one ingredient line on its connective keyword.

        Keywords are checked in this order: "and", "and/or", "plus extra",
        "plus", "or" (with special handling when "for" is also present).
        When the keyword is present as a word but the line cannot be split
        on it, an error is printed and the whole line is returned.

        Returns:
            list of ingredient strings extracted from *line*.
        """
        line_words = line.split()

        if "and" in line_words:
            try:
                primary_ing, extra_ing = line.split(" and ", 1)
            except ValueError:
                print("1. ERROR in AND: ", line.encode("utf-8"))
                return [line]
            return [primary_ing, extra_ing]

        if "and/or" in line_words:
            try:
                primary_ing, alt_ing = line.split(" and/or ", 1)
            except ValueError:
                print("error in AND/OR: ", line.encode("utf-8"))
                return [line]
            return [primary_ing, alt_ing]

        if "plus extra" in line:
            # The text after "plus extra" refers to the same ingredient.
            return [line.split("plus extra", 1)[0]]

        if "plus" in line_words:
            try:
                primary_ing, extra_ing = line.split(" plus ", 1)
            except ValueError:
                print("plus error line: ", line.encode("utf-8"))
                return [line]
            return [primary_ing, extra_ing]

        if "or" in line_words:
            # "or" and "for" are spelled alike, so their positions decide
            # how the line is split.
            if "for" not in line_words:
                try:
                    primary_ing, alt_ing = line.split(" or ", 1)
                except ValueError:
                    print("error in OR: ", line.encode("utf-8"))
                    return [line]
                return [primary_ing, alt_ing]

            if line.find(" for ") > line.find(" or "):
                try:
                    primary_ing, alternative = line.split(" or ", 1)
                    alt_ing, _why_needed = alternative.split("for ", 1)
                except ValueError:
                    print("error in OR and FOR: ", line.encode("utf-8"))
                    return [line]
                return [primary_ing, alt_ing]

            try:
                # Excludes the text saying why the ingredient is needed.
                primary_ing, _why_needed = line.split(" for ", 1)
            except ValueError:
                print("error in FOR: ", line.encode("utf-8"))
                return [line]
            return [primary_ing]

        return [line]

    def ParseRecipeFile(self):
        """Parse every record of ``self.recipeFileHandle`` into the database.

        ``self.recipe_id`` is incremented once per record, including records
        that are later dropped because a time could not be parsed. Progress
        is printed every 100 records.
        """
        line_num = 1
        ingredients = []
        description = ""

        for line in self.recipeFileHandle:
            if line_num == 1:
                url = line
                self.recipe_id += 1
                line_num += 1
                if self.recipe_id % 100 == 0:
                    print("Processed %d recipes." % self.recipe_id)
            elif line_num == 2:
                img_url = line
                line_num += 1
            elif line_num == 3:
                title = line
                line_num += 1
            elif line_num == 4:
                # The description runs until the "preptime" marker line.
                if "preptime" in line:
                    line_num += 1
                else:
                    description = description + " " + line
            elif line_num == 5:
                prep_time = self.clean_time(line, url)
                line_num += 1
            elif line_num == 6:
                cook_time = self.clean_time(line, url)
                line_num += 1
            elif line_num == 7:
                try:
                    servings = int(re.findall(r'\d+', line)[0])
                except (IndexError, ValueError):
                    servings = 1  # e.g. "Makes one jar"
                line_num += 1
            elif "*****eol*****" in line:
                # Records with an unparseable time are not stored.
                if self.SKIP_RECIPE not in (prep_time, cook_time):
                    clean_ingredients = self.cleaning_ing_list(ingredients)
                    # Matching class labels are indexed like ingredients.
                    labels = self.recipe_class(title, description)
                    if labels:
                        clean_ingredients = clean_ingredients + labels
                    for ingredient in clean_ingredients:
                        self.database.AddToIngredientIndexTable(
                            ingredient, self.recipe_id)
                    self.database.AddToRecipeInfoTable(
                        self.recipe_id, url, img_url, title, description,
                        prep_time, cook_time, servings,
                        len(clean_ingredients))
                line_num = 1
                ingredients = []
                description = ""
            else:
                # Every remaining line of the record is an ingredient entry.
                ingredients.extend(self._split_ingredient_line(line))
                line_num += 1

    def ParseRecipeFiles(self):
        """Parse every file listed in ``recipeFiles``, in order."""
        for recipeFile in self.recipeFiles:
            # `with` closes the file even if parsing raises.
            with open(recipeFile, 'r', encoding="utf-8") as handle:
                self.recipeFileHandle = handle
                self.ParseRecipeFile()
from Database import *
from VariableByteEncoder import *

# Smoke test for the ingredient index: store a handful of
# (ingredient, recipe id) pairs, then read the ids back per ingredient.

# Pairs to store, in insertion order.
_index_entries = (
    ("olive oil", 100),
    ("chicken", 100),
    ("spinach", 100),
    ("fish", 200),
    ("chicken", 200),
    ("chicken", 300),
    ("spinach", 300),
    ("corn oil", 400),
    ("corn oil", 200),
    ("oil", 500),
    ("oil", 200),
)

# Ingredients to look up afterwards, in print order.
_lookups = ("olive oil", "chicken", "spinach", "fish", "corn oil", "oil")

database = Database("recipes")

print("Adding ingredients and recipe IDs to database.")
for _ingredient, _recipe_id in _index_entries:
    database.AddToIngredientIndexTable(_ingredient, _recipe_id)

print("Retrieving recipe IDs from database.")
for _ingredient in _lookups:
    recipeIds = database.GetRecipeIds(_ingredient)
    print(recipeIds)