def parseLink(link, restaurant):
    """Fetch a nutrition page and store the item it describes as an ``Item``.

    The page at ``link`` is downloaded and parsed with BeautifulSoup. The
    item name is taken from the page title and the meal is always "any".

    * If the page has no element with class ``nfbox``, an ``Item`` carrying
      only name, restaurant and meal is saved and the function returns.
    * Otherwise every nutrition value is read from the page and handed to
      ``Item.objects.get_or_create`` together with name, restaurant and
      meal; the page title and a blank line are then printed.

    Args:
        link: URL of the item's nutrition page.
        restaurant: Value stored in the item's ``restaurant`` field.

    Returns:
        None.

    Raises:
        Whatever ``urllib2.urlopen`` raises for a failed request. A label
        that cannot be found on the page, or an amount without a "g"/"mg"
        unit, also raises (e.g. ``AttributeError`` from ``None.group()``).
    """
    # Raw strings so that "\d" and "\." are unambiguously regex escapes.
    number_re = re.compile(r"\d+\.?\d*")
    unit_re = re.compile(r"[m]?[g]")

    # Close the HTTP response explicitly instead of leaving it to the GC.
    response = urllib2.urlopen(link)
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html)
    title = soup.title.text

    # Error handling: no nutrition box on the page, so store a bare item.
    if not soup.find(class_="nfbox"):
        newItem = Item()
        newItem.name = title
        newItem.restaurant = restaurant
        newItem.meal = "any"
        newItem.save()
        return

    # checkCal / checkNF are defined elsewhere in this module. checkNF is
    # given the raw regex match -- presumably it copes with None; confirm.
    facts = {
        "calories": checkCal(soup.find(text="Calories")),
        "fat_calories": checkNF(
            number_re.search(soup.find(class_="nffatcal").text)),
    }

    # Labels matched as an exact text node: amount and units are read from
    # the node following the label, the %DV from the element after that.
    for prefix, label, has_dv in (
            ("total_fat", "Total Fat", True),
            ("cholesterol", "Cholesterol", True),
            ("sodium", "Sodium", True),
            ("total_carbs", "Total Carbohydrate", True),
            ("protein", "Protein", False),
    ):
        amount_node = soup.find(text=label).next_element
        facts[prefix] = checkNF(number_re.search(amount_node))
        facts[prefix + "_units"] = unit_re.search(amount_node).group()
        if has_dv:
            facts[prefix + "_dv"] = checkNF(
                number_re.search(amount_node.next_element.text))

    # Labels matched by regex: amount and units are read from the matched
    # text node itself, the %DV from the element following it.
    for prefix, label_pattern, has_dv in (
            ("saturated_fat", "(?=Saturated Fat)", True),
            ("trans_fat", "(?=Trans Fat)", False),
            ("dietary_fiber", "(?=Dietary Fiber)", True),
            ("sugars", "(?=Sugars)", False),
    ):
        label_node = soup.find(text=re.compile(label_pattern))
        facts[prefix] = checkNF(number_re.search(label_node))
        facts[prefix + "_units"] = unit_re.search(label_node).group()
        if has_dv:
            facts[prefix + "_dv"] = checkNF(
                number_re.search(label_node.next_element.text))

    # Vitamins and minerals only carry a %DV, two nodes after the label.
    for field, label in (
            ("vitamin_A_dv", "Vitamin A"),
            ("vitamin_C_dv", "Vitamin C"),
            ("calcium_dv", "Calcium"),
            ("iron_dv", "Iron"),
    ):
        dv_element = soup.find(text=label).next_element.next_element
        facts[field] = checkNF(number_re.search(dv_element.text))

    # Every field is part of the lookup, so a row is only reused when all
    # nutrition values are identical to an existing one.
    Item.objects.get_or_create(
        name=title, restaurant=restaurant, meal="any", **facts)

    # NOTE(review): assumed to run only after a successful parse; the
    # original indentation of these two prints was lost -- confirm.
    print(title)
    print('\n')
def parseLink(self, name, link, meal, category, month, day, year):
    """Fetch a dining-hall nutrition page and record the item it describes.

    The page at ``link`` is downloaded and parsed with BeautifulSoup. The
    item is identified by name, restaurant ("Dining Hall"), meal, category
    and date; the nutrition values read from the page are passed as
    ``defaults`` to ``Item.objects.get_or_create``, so they are only
    written when a new row is created.

    Nothing is stored when the request fails with ``urllib2.HTTPError`` or
    when the page contains an element with class ``rderror``.

    Args:
        name: Item name.
        link: URL of the item's nutrition page.
        meal: 1, 2 or 3 for "Breakfast", "Lunch" or "Dinner"; any other
            value leaves ``Item``'s own initial ``meal`` value in place.
        category: Value stored in the item's ``category`` field.
        month: Value stored in the item's ``month`` field.
        day: Value stored in the item's ``day`` field.
        year: Value stored in the item's ``year`` field.

    Returns:
        None.

    Raises:
        A label that cannot be found on the page, or an amount without a
        "g"/"mg" unit, raises (e.g. ``AttributeError`` from
        ``None.group()``). Errors from ``urllib2.urlopen`` other than
        ``HTTPError`` propagate.
    """
    # Raw strings so that "\d" and "\." are unambiguously regex escapes.
    number_re = re.compile(r"\d+\.?\d*")
    unit_re = re.compile(r"[m]?[g]")

    try:
        response = urllib2.urlopen(link)
    except urllib2.HTTPError:
        return
    # Close the HTTP response explicitly instead of leaving it to the GC.
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html)

    # The unsaved Item only carries the identifying fields used for the
    # lookup below (and supplies the initial meal value when `meal` is not
    # one of the three known codes).
    newItem = Item()
    newItem.name = name
    newItem.restaurant = "Dining Hall"
    if meal == 1:
        newItem.meal = "Breakfast"
    elif meal == 2:
        newItem.meal = "Lunch"
    elif meal == 3:
        newItem.meal = "Dinner"
    newItem.category = category
    newItem.month = month
    newItem.day = day
    newItem.year = year

    # Error handling: the page reports that no info is available.
    if soup.find(class_="rderror"):
        return

    # self.checkNF is given the raw regex match -- presumably it copes
    # with None; confirm against its definition.
    facts = {
        'calories': self.checkCal(soup.find(text="Calories")),
        'fat_calories': self.checkNF(
            number_re.search(soup.find(class_="nffatcal").text)),
    }

    # Labels matched as an exact text node: amount and units are read from
    # the node following the label, the %DV from the element after that.
    for prefix, label, has_dv in (
            ('total_fat', "Total Fat", True),
            ('cholesterol', "Cholesterol", True),
            ('sodium', "Sodium", True),
            ('total_carbs', "Total Carbohydrate", True),
            ('protein', "Protein", False),
    ):
        amount_node = soup.find(text=label).next_element
        facts[prefix] = self.checkNF(number_re.search(amount_node))
        facts[prefix + '_units'] = unit_re.search(amount_node).group()
        if has_dv:
            facts[prefix + '_dv'] = self.checkNF(
                number_re.search(amount_node.next_element.text))

    # Labels matched by regex: amount and units are read from the matched
    # text node itself, the %DV from the element following it.
    for prefix, label_pattern, has_dv in (
            ('saturated_fat', "(?=Saturated Fat)", True),
            ('trans_fat', "(?=Trans Fat)", False),
            ('dietary_fiber', "(?=Dietary Fiber)", True),
            ('sugars', "(?=Sugars)", False),
    ):
        label_node = soup.find(text=re.compile(label_pattern))
        facts[prefix] = self.checkNF(number_re.search(label_node))
        facts[prefix + '_units'] = unit_re.search(label_node).group()
        if has_dv:
            facts[prefix + '_dv'] = self.checkNF(
                number_re.search(label_node.next_element.text))

    # Vitamins and minerals only carry a %DV, two nodes after the label.
    for field, label in (
            ('vitamin_A_dv', "Vitamin A"),
            ('vitamin_C_dv', "Vitamin C"),
            ('calcium_dv', "Calcium"),
            ('iron_dv', "Iron"),
    ):
        dv_element = soup.find(text=label).next_element.next_element
        facts[field] = self.checkNF(number_re.search(dv_element.text))

    # Identity fields form the lookup; nutrition values are defaults, so an
    # existing row for the same item and date is left untouched.
    Item.objects.get_or_create(
        name=newItem.name,
        restaurant=newItem.restaurant,
        meal=newItem.meal,
        category=newItem.category,
        month=newItem.month,
        day=newItem.day,
        year=newItem.year,
        defaults=facts,
    )
    self.stdout.write('\n')