def main():
    """Walk the listing pages starting at START_PAGE and print entry titles.

    For every page the text of each ``<h2 class="entry-title">`` element is
    printed, followed by the link to the next page.  The crawl ends when
    ``getNextPageLink`` returns ``None`` (that final ``None`` is printed too).
    """
    url = START_PAGE
    while url is not None:
        soup = utils.getPage(url)
        for title in soup.findAll('h2', {"class": "entry-title"}):
            print(title.text)
            # Article fetching is currently disabled:
            # getArticle(title['href'])
        # Resolve the next-page link once and reuse it for both the log line
        # and the loop variable instead of parsing the page twice.
        url = getNextPageLink(soup)
        print(url)
    print("Crawling finished!")
def netCrawler(recipe_id, n):
    """Scrape one page of search results and store every recipe it links to.

    The listing URL is built from the offset ``12 * n``; every
    ``.result-item.recipe`` element on it contributes the href of its first
    link.  Each linked page is fetched, parsed and written with
    ``utils.insert('recipe', ...)``; when that insert does not return -1,
    one ``recipe_ingredient`` row is written per ingredient line.

    NOTE(review): the base-URL literals are empty strings in this file
    (presumably stripped) -- confirm before running.
    NOTE(review): ``utils.getPage`` is assumed to return a PyQuery-style
    document (callable with a CSS selector) -- confirm against utils.

    Args:
        recipe_id: id to give to the first recipe stored by this call.
        n: index of the result page; the offset used is ``12 * n``.

    Returns:
        The next unused recipe id.  The id advances by one for every recipe
        page that was stored or reported as a duplicate; pages without a
        recipe name are skipped and do not consume an id.
    """
    offset = 12 * n
    urln = '' + str(offset)
    time.sleep(1)  # pause before every request
    S = utils.getPage(urln)

    urlist = []
    for element in S('.result-item.recipe'):
        href = S(element).find('a').attr('href')
        # A result without a link cannot be followed, and concatenating
        # None to the base URL would raise TypeError.
        if href is not None:
            urlist.append('' + href)
    print('net list made ' + str(n))

    for nextUrl in urlist:
        time.sleep(1)  # pause before every request
        p = utils.getPage(nextUrl)

        RecpName = [p(element).text() for element in p('.fn_name')]
        print(RecpName)
        if not RecpName:
            # Without a title there is nothing to de-duplicate or store;
            # indexing RecpName[0] here would abort the whole crawl.
            print('no recipe name found')
            continue
        if utils.isDuplicate(RecpName[0]):
            recipe_id += 1
            print('duplicate')
            continue

        # ImgURL stays '' when no image is given, which happens often.
        ImgURL = ''
        for element in p('img#recipe-player-th'):
            ImgURL = p(element).attr('src')

        unparsedCtime = ''
        unparsedPtime = ''
        c = p('dd.clrfix')
        for element in c('meta'):
            itemprop = p(element).attr('itemprop')
            if itemprop == 'cookTime':
                unparsedCtime = p(element).attr('content')
            elif itemprop == 'prepTime':
                unparsedPtime = p(element).attr('content')

        # The difficulty is the text of the last 'dd.clrfix' element; fall
        # back to '' when the page has none instead of raising IndexError.
        difficulty = p(c[-1]).text() if len(c) else ''

        Ctime = (utils.FN_minutesParser(unparsedCtime)
                 + 60 * utils.FN_hourParser(unparsedCtime))
        Ptime = (utils.FN_minutesParser(unparsedPtime)
                 + 60 * utils.FN_hourParser(unparsedPtime))

        servings = [p(element).text() for element in c('span')]

        # Every <li> of every ingredient list on the page.  The ingredient
        # text still contains its amount (e.g. "1 large tomato, diced in
        # 1/2-inch pieces"); it is not split, so quantity is stored as ''.
        IngrName = [p(element).text()
                    for element in p('.kv-ingred-list1')('li')]

        directions = []
        for element in p('.fn_instructions')('p'):
            text = p(element).text()
            if text not in ('', '\n'):
                directions.append(text)

        decript = utils.descriptionize(directions)
        RecipeDict = {
            'id': recipe_id,
            'name': RecpName[0],
            'description': decript,
            'servings': servings[0] if servings else '',
            'url': nextUrl,
            'img_url': ImgURL,
            'difficulty': difficulty,
            'cook_time': Ctime,
            'prep_time': Ptime,
        }
        k = utils.insert('recipe', **RecipeDict)

        # Ingredients are only stored when the recipe insert did not
        # return -1.
        if k != -1:
            for ingredient in IngrName:
                utils.insert('recipe_ingredient',
                             recipe_id=recipe_id,
                             ingredient_name=ingredient,
                             quantity='')
        recipe_id += 1
    return recipe_id
def getArticle(url):
    """Fetch the page at ``url`` and hand the parsed result to ``getTags``.

    Always returns None; whatever ``getTags`` returns is discarded.
    """
    getTags(utils.getPage(url))
def allCrawler(recipe_id, n):
    """Scrape one listing page and store every recipe it links to.

    The listing URL is built from the page number ``n * 17``; every
    ``.rectitlediv`` element on it contributes the href of its first link.
    Each linked page is fetched, parsed and written with
    ``utils.insert('recipe', ...)``; when that insert does not return -1,
    one ``recipe_ingredient`` row is written per ingredient name.

    NOTE(review): the base-URL literal is an empty string in this file
    (presumably stripped) -- confirm before running.
    NOTE(review): ``utils.getPage`` is assumed to return a PyQuery-style
    document (callable with a CSS selector) -- confirm against utils.

    Args:
        recipe_id: id to give to the first recipe stored by this call.
        n: listing index; the page actually requested is ``n * 17``.

    Returns:
        The next unused recipe id.  The id advances by one for every recipe
        page visited, whether it was stored or reported as a duplicate.
    """
    page = n * 17  # grab every 17th page, to provide a wide spread of results
    urln = '' + str(page)
    time.sleep(1)  # Sleeping is done according to the Web Crawling Standards
    # The listing is fetched once; the original also issued a second,
    # immediately discarded request through ``pq(url=urln)``.
    S = utils.getPage(urln)

    urlist = []
    for element in S('.rectitlediv'):
        href = S(element).find('a').attr('href')
        # A title block without a link cannot be followed.
        if href is not None:
            urlist.append(href)
    print('all list made ' + str(n))

    for nextUrl in urlist:
        time.sleep(1)  # Sleeping is done according to the Web Crawling Standards
        p = utils.getPage(nextUrl)

        RecpName = [p(element).text() for element in p('#itemTitle')]
        print(RecpName)
        # Resolve the 'null' fallback before the duplicate check: indexing
        # RecpName[0] on a page without a title raised IndexError and made
        # the fallback unreachable.
        Rname = RecpName[0] if RecpName else 'null'
        if utils.isDuplicate(Rname):
            print('duplicate')
            recipe_id += 1
            continue

        # Only non-empty elements carrying exactly the 'ingredient-name'
        # class count as ingredient names.
        IngrName = []
        for element in p('#lblIngName'):
            node = p(element)
            if node.text() != '' and node.attr('class') == 'ingredient-name':
                IngrName.append(node.text())
        IngrAmnt = [p(element).text() for element in p('#lblIngAmount')]
        Description = [p(element).text()
                       for element in p('span.plaincharacterwrap.break')]
        ImageURL = [p(element).attr('src') for element in p('#imgPhoto')]
        Cookpmin = [p(element).text() for element in p('#prepMinsSpan')]
        Cooktmin = [p(element).text() for element in p('#totalMinsSpan')]
        Cookphour = [p(element).text() for element in p('#prepHoursSpan')]
        Cookthour = [p(element).text() for element in p('#totalHoursSpan')]
        Servings = [p(element).text() for element in p('#lblYield')]

        # Preparation time, hours then minutes.
        HpTime = utils.makeDecimalTime(Cookphour)
        MpTime = utils.makeDecimalTime(Cookpmin)
        # NOTE(review): cook_time is derived from the *total* time spans
        # (#totalHoursSpan / #totalMinsSpan), exactly as before; the
        # #cookHoursSpan / #cookMinsSpan values were scraped but never
        # used.  Confirm which one is intended.
        HTime = utils.makeDecimalTime(Cookthour)
        MTime = utils.makeDecimalTime(Cooktmin)

        # Convert all times into minutes.
        Ctime = HTime * 60 + MTime
        Ptime = HpTime * 60 + MpTime

        decript = utils.descriptionize(Description)
        RecipeDict = {
            'id': recipe_id,
            'name': Rname,
            'description': decript,
            'servings': Servings[0] if Servings else 'null',
            'url': nextUrl,
            'img_url': ImageURL[0] if ImageURL else 'null',
            'cook_time': Ctime,
            'prep_time': Ptime,
        }
        k = utils.insert('recipe', **RecipeDict)

        # Ingredients are only stored when the recipe insert did not
        # return -1.
        if k != -1:
            # Pad the amounts with '' so that ingredients without one
            # (e.g. "season to preference") still get a row; a negative
            # difference yields no padding.
            padding = [''] * (len(IngrName) - len(IngrAmnt))
            for ingredient, amount in zip(IngrName, IngrAmnt + padding):
                utils.insert('recipe_ingredient',
                             recipe_id=recipe_id,
                             ingredient_name=ingredient,
                             quantity=amount)
        recipe_id += 1
    return recipe_id
import time
import utils

# recipe_id = utils.latest_recipe_id

# Index labels: '123' followed by the capital letters A to Z.
abc = ['123'] + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
# Labels that are actually crawled by the loop below.
a = ['A']

for n in a:
    urln = '' + str(n)
    time.sleep(1)  # pause before every request
    S = utils.getPage(urln)

    templst = [S(u) for u in S('.list')]
    # Original note: only templst 0 and 1 have the results in them.
    urlist = [u.find('a').attr('href') for u in templst]
    print('list made ' + str(n))

    for nextUrl in urlist:
        time.sleep(1)  # pause before every request
        p = utils.getPage(nextUrl)
        # Keep the text of every '#rz-lead' element whose class attribute
        # is exactly 'fn'.
        RecpName = [p(this).text()
                    for this in p('#rz-lead')
                    if p(this).attr('class') == 'fn']
        print(RecpName)