import math
import urllib
from datetime import datetime

from bs4 import BeautifulSoup

import scraperFunks

# ROOTURL1, ROOTURL2, ROOTURL3 (BazaarVoice URL fragments), dateObj (the review
# date cutoff), and totalPages are module-level values this file expects to be
# defined elsewhere in the project.


def scraper(modelNum, sheet, j):
    baseURL = "http://m.homedepot.com"
    allHTML = scraperFunks.getHTML(
        scraperFunks.createURL(modelNum, "http://m.homedepot.com/s/"))
    allReviewsOnPage = scraperFunks.findAllReviews(allHTML, 'li', 'id', 'reviews')

    # Get the last date the scraper was run from the text file.
    try:
        dateFile = open("LastRanDate.txt", "r")
        stringDate = dateFile.read()
        dateFile.close()
        lastRanDate = datetime.strptime(stringDate, "%Y-%m-%d").date()
    except IOError:
        print("No file for date previously run")
        lastRanDate = -1

    # Check to see if there are no reviews.
    if len(allReviewsOnPage) == 0:
        print("Could not find product with model number " + modelNum)
        return

    for rev in allReviewsOnPage:
        if rev.find(string="No Reviews"):
            print("No Reviews for Model Number " + modelNum)
            return "No Reviews"

    reviewsPage = scraperFunks.findNumberOfReviewsText(
        allReviewsOnPage, 'a', 'class',
        'text-secondary flex space-between flex-grow-1')

    # Check for only one review.
    if reviewsPage == []:
        print("Only one review")
        return "Only one review"

    numReviews = scraperFunks.getNumReviews(reviewsPage, 'span', 'class',
                                            'text-primary')

    # Determine the number of pages of reviews, with 10 reviews per page.
    numPages = int(math.floor(numReviews / 10))
    mod = numReviews % 10
    if mod > 0:
        numPages = numPages + 1

    link = reviewsPage.get('href')
    link = link.replace("sort=Most-helpful", "sort=Oldest")

    if lastRanDate == -1:
        # No previous run on record: scrape every page, oldest first.
        i = 0  # current page
        count = 0
        while i < numPages:
            # Get a page of reviews.
            pageOfReviewsHtml = scraperFunks.getHTML(
                scraperFunks.createURL(link, baseURL))
            # Find all reviews on the current page.
            allReviewsOnPage = scraperFunks.findAllReviews(
                pageOfReviewsHtml, 'div', 'class',
                'reviews-entry p-top-normal p-bottom-normal sborder border-bottom border-default review static-height')
            # Write the reviews from this page into the Excel sheet.
            for review in allReviewsOnPage:
                count += 1
                sheet.write(j, 0, modelNum)
                starRating = str(review.find('div', {'class': 'stars'}).get('rel'))
                sheet.write(j, 2, starRating)
                reviewText = review.find('p', {'class': 'review line-height-more'}).string
                sheet.write(j, 3, reviewText)
                reviewDate = review.find('div', {'class': 'small text-muted right'}).string
                sheet.write(j, 1, reviewDate)
                # Helpful can sometimes not exist; findAll then returns an
                # empty list (never None), so test for emptiness.
                helpful = review.findAll('span', {'class': 'small m-left-small'})
                if not helpful:
                    j = j + 1
                    continue
                # There can be multiple spans with the same class. The first is
                # something like "Pro", "DIY", or "Recommended"; another is
                # "x in y found this helpful". We want that one.
                wasHelpful = False
                for item in helpful:
                    if item.string and "helpful" in item.string:
                        helpful = item.string
                        wasHelpful = True
                        break
                if wasHelpful:
                    sheet.write(j, 4, helpful)
                else:
                    sheet.write(j, 4, "Not Helpful")
                j = j + 1
            if i != (numPages - 1):
                # The last anchor in the pagination bar is the "next page" link.
                paginationSoup = pageOfReviewsHtml.find_all('ul', {'class': 'pagination'})
                allAElements = paginationSoup[0].find_all('a')
                link = allAElements[len(allAElements) - 1].get('href')
            i = i + 1
    else:
        # A previous run exists: under the "Oldest" sort the newest reviews sit
        # on the last page, so start there and walk backwards until a review is
        # no newer than the last run date.
        i = numPages  # last page of reviews
        count = 0
        while i > 0:
            # Point the link at page i; the initial sort link targets page 1,
            # so this rewrite only fires on the first pass.
            pageString = "/" + str(i) + "?"
            link = link.replace("/1?", pageString)
            # Get a page of reviews.
            pageOfReviewsHtml = scraperFunks.getHTML(
                scraperFunks.createURL(link, baseURL))
            # Find all reviews on the current page.
            allReviewsOnPage = scraperFunks.findAllReviews(
                pageOfReviewsHtml, 'div', 'class',
                'reviews-entry p-top-normal p-bottom-normal sborder border-bottom border-default review static-height')
            numOfReviews = len(allReviewsOnPage) - 1
            noMoreNewReviews = False
            # Walk the page bottom-up so reviews are visited newest first.
            while numOfReviews >= 0:
                reviewDate = allReviewsOnPage[numOfReviews].find(
                    'div', {'class': 'small text-muted right'}).string
                reviewDateTime = datetime.strptime(reviewDate, "%B %d, %Y").date()
                # Check whether this is a new review.
                if reviewDateTime <= lastRanDate:
                    noMoreNewReviews = True
                    break
                count += 1
                sheet.write(j, 0, modelNum)
                starRating = str(allReviewsOnPage[numOfReviews].find(
                    'div', {'class': 'stars'}).get('rel'))
                sheet.write(j, 2, starRating)
                reviewText = allReviewsOnPage[numOfReviews].find(
                    'p', {'class': 'review line-height-more'}).string
                sheet.write(j, 3, reviewText)
                sheet.write(j, 1, reviewDate)
                # As above: findAll never returns None, so test for an empty list.
                helpful = allReviewsOnPage[numOfReviews].findAll(
                    'span', {'class': 'small m-left-small'})
                if not helpful:
                    j = j + 1
                    numOfReviews = numOfReviews - 1
                    continue
                # Same span scan as above: pick out the
                # "x in y found this helpful" span among the candidates.
                wasHelpful = False
                for item in helpful:
                    if item.string and "helpful" in item.string:
                        helpful = item.string
                        wasHelpful = True
                        break
                if wasHelpful:
                    sheet.write(j, 4, helpful)
                else:
                    sheet.write(j, 4, "Not Helpful")
                j = j + 1
                numOfReviews = numOfReviews - 1
            if noMoreNewReviews:
                break
            if i > 1:
                # Follow the "previous page" anchor (the first one in the
                # pagination bar) down to and including page 1.
                paginationSoup = pageOfReviewsHtml.find_all('ul', {'class': 'pagination'})
                allAElements = paginationSoup[0].find_all('a')
                link = allAElements[0].get('href')
            i = i - 1

    # Record today's date so the next run only picks up newer reviews.
    date = datetime.today().date()
    dateFile = open("LastRanDate.txt", "w")
    dateFile.write(str(date))
    dateFile.close()
    print("Number of Reviews " + str(count) + " for productID " + str(modelNum))
    return j
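# --- Hedged usage sketch (not part of the original source). The
# sheet.write(row, column, value) calls in scraper() match the xlwt API, so a
# driver might look like this; run_scraper, the model-number list, and the
# output filename are hypothetical names chosen for illustration.
import xlwt  # assumption: `sheet` above is an xlwt Worksheet


def run_scraper(modelNums, outPath="reviews.xls"):
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("Reviews")
    row = 0
    for modelNum in modelNums:
        result = scraper(modelNum, sheet, row)
        # scraper() returns the next free row on success, and a status string
        # (or None) when it bails out early.
        if isinstance(result, int):
            row = result
    workbook.save(outPath)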
def scrapePage(productID, pageNumber, csvFile):
    # Make the HTTP request.
    request = urllib.urlopen(ROOTURL1 + productID + ROOTURL2 + str(pageNumber) + ROOTURL3)

    # Find where our reviews are located in the BazaarVoice JavaScript:
    # the payload we want sits on line 9 of the response.
    count = 1
    for line in request.readlines():
        if count == 9:
            index = line.index("SourceID")
            string = line[index + 11:]
            string = string[:len(string) - 6]
        count += 1

    # Remove bad characters.
    string = string.replace("<br />", "\t")
    string = string.replace("\\/", "/")
    string = string.replace("\\\"", "")
    # Strip stray high bytes: 0xbd (U+00BD, VULGAR FRACTION ONE HALF) and
    # 0xc2 (a leftover UTF-8 lead byte).
    string = string.replace(chr(ord(u'\xbd')), "")
    string = string.replace(chr(ord(u'\xc2')), "")

    # Make soup with a string that is clean of bad characters.
    soup = BeautifulSoup(string, "html.parser")
    prettyHTML = soup.prettify()
    # For some reason, a missing apostrophe in the wrong place did some weird
    # things; let's take care of that.
    # [WARNING]: prettyHTML will have extra stuff that isn't in soup.
    prettyHTML = prettyHTML.replace("\'http:=\"\"", "\'http:=\"\"\'")
    # Make BETTER soup with better HTML formatting.
    soup = BeautifulSoup(prettyHTML, "html.parser")

    # Find all the reviews (should be about 30-ish).
    allReviews = scraperFunks.findAllReviews(soup, "div", "id",
                                             "BVSubmissionPopupContainer")
    j = 1
    for review in allReviews:
        print("Review Number: " + str(j))
        reviewDate = review.find('meta', {'itemprop': 'datePublished'})
        date = reviewDate['content'][:-1]
        print("Date: " + reviewDate['content'])
        nickname = review.find('span', {'itemprop': 'author'})
        print("Username: " + nickname.string.strip())
        reviewTitle = review.find('span', {'itemprop': 'name'})
        print("Review Title: " + reviewTitle.string.strip())
        reviewScore = review.find('span', {'itemprop': 'ratingValue'})
        print("Review Score: " + reviewScore.string.strip())
        reviewTextSpans = review.find('div', {'itemprop': 'description'}).findAll(
            'span', {'class': 'BVRRReviewText'})
        print("Review Text: ")
        reviewText = ""
        for paragraph in reviewTextSpans:
            print(paragraph.string.strip())
            reviewText = reviewText + paragraph.string.strip() + "\t"
        yesHelpful = review.find('a', {'title': 'helpful'}).find(
            'span', {'class': 'BVDINumber'})
        print("How many people found this Helpful: " + yesHelpful.string.strip())
        notHelpful = review.find('a', {'title': 'unhelpful'}).find(
            'span', {'class': 'BVDINumber'})
        print("How many people found this Not Helpful: " + notHelpful.string.strip())
        print("\n")
        csvFile.writerow((j, productID, date, reviewScore.string.strip(),
                          reviewText.strip(), yesHelpful.string.strip(),
                          notHelpful.string.strip()))
        j = j + 1
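# --- Hedged usage sketch (hypothetical, not in the original source): this
# version of scrapePage() only needs a csv.writer-style object, so a caller
# might wire it up like this. The filename and header row are illustrative;
# ROOTURL1/2/3 must already be defined at module level for the request to work.
import csv


def scrape_one_page(productID, pageNumber=1):
    with open("reviews_" + productID + ".csv", "wb") as f:  # "wb" for the Python 2 csv module
        writer = csv.writer(f)
        writer.writerow(("n", "productID", "date", "score", "text",
                         "helpful", "notHelpful"))
        scrapePage(productID, pageNumber, writer)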
def scrapePage(productID, pageNumber, csvFile):
    global j
    # Make the HTTP request.
    request = urllib.urlopen(ROOTURL1 + productID + ROOTURL2 + str(pageNumber) + ROOTURL3)
    # Remove any output that cannot be processed by BeautifulSoup.
    string = scraperFunks.prepareHTML(request)

    # Make soup with a string that is clean of bad characters.
    soup = BeautifulSoup(string, "html.parser")
    prettyHTML = soup.prettify()
    # Dump the prettified HTML for debugging.
    out = open("test2.txt", "w")
    out.write(prettyHTML)
    out.close()
    # For some reason, a missing apostrophe in the wrong place did some weird
    # things; let's take care of that.
    # [WARNING]: prettyHTML will have extra stuff that isn't in soup.
    prettyHTML = prettyHTML.replace("\'http:=\"\"", "\'http:=\"\"\'")
    # Make BETTER soup with better HTML formatting.
    soup = BeautifulSoup(prettyHTML, "html.parser")

    # Find the total number of reviews.
    if pageNumber == 1:
        numReviews = soup.find('div', {'class': 'BVRRHistogramTitle'}).find(
            'span', {'class': 'BVRRNumber'}).string.strip()
        print("Total Number of Reviews: " + numReviews)
        # numReviews is a string, so convert before comparing.
        if int(numReviews) <= 8:
            pages = 1
        else:
            pages = (math.floor((int(numReviews) - 8) / 30)) + 2
            pages = int(pages)
        global totalPages
        totalPages = pages

    # Find all the reviews (should be about 30-ish).
    allReviews = scraperFunks.findAllReviews(soup, "div", "class",
                                             "BVRRContentReview")

    # Parse through each review on the page.
    for review in allReviews:
        revID = review['id'][27:]
        print("ReviewID: " + revID)

        # Check the date of the review and compare it to the cutoff date.
        reviewDate = review.find('meta', {'itemprop': 'datePublished'})
        # Format the date output.
        dateString = reviewDate['content'][:-1]
        print("Date: " + dateString)
        if dateObj != -1:
            reviewDateObj = datetime.strptime(dateString, "%Y-%m-%d")
            if reviewDateObj < dateObj:
                return False  # the review was outside the date range

        # Output for debugging purposes.
        print("Review Number: " + str(j))

        # Extra field that is not in the current output (moved to Cycle 2).
        nicknames = review.find('span', {'itemprop': 'author'})
        username = nicknames.get_text().strip()
        print("Username: " + username)

        reviewTitle = review.find('span', {'itemprop': 'name'}).get_text().strip()
        print("Review Title: " + reviewTitle)

        # Gets the three sub-ratings, when present; at least four rating
        # images are needed before indexing [1..3].
        EoI = ""
        quality = ""
        value = ""
        subRatingHolder = review.findAll('img', {'class': 'BVImgOrSprite'})
        if len(subRatingHolder) > 3:
            EoI = subRatingHolder[1]['title']
            quality = subRatingHolder[2]['title']
            value = subRatingHolder[3]['title']
        print("EoI: " + EoI)
        print("Q: " + quality)
        print("V: " + value)

        # Gets the review score.
        reviewScore = review.find('span', {'itemprop': 'ratingValue'})
        print("Review Score: " + reviewScore.string.strip())

        # Gets the review text.
        reviewTextSpans = review.find('div', {'itemprop': 'description'}).findAll(
            'span', {'class': 'BVRRReviewText'})
        print("Review Text: ")
        # Reviews can be separated into multiple <span>s if the reviewer used
        # newlines; put them into one string for output.
        reviewText = ""
        for paragraph in reviewTextSpans:
            reviewText = reviewText + paragraph.string.strip() + "\t"
        # Remove any special characters that cannot be saved into the csv file
        # (non-ASCII, i.e. codes of 128 and above).
        reviewText = reviewText.strip().encode("ascii", "ignore")
        print(reviewText)

        # Gets the number of people who marked the review as helpful
        # (if none recorded, outputs 0).
        yesHelpful = review.find('a', {'title': 'helpful'})
        if yesHelpful is not None:
            yesHelpful = yesHelpful.find('span', {'class': 'BVDINumber'}).string.strip()
            print("How many people found this Helpful: " + yesHelpful)
        else:
            yesHelpful = "0"

        # Gets the number of people who marked the review as not helpful
        # (if none recorded, outputs 0).
        notHelpful = review.find('a', {'title': 'unhelpful'})
        if notHelpful is not None:
            notHelpful = notHelpful.find('span', {'class': 'BVDINumber'}).string.strip()
            print("How many people found this Not Helpful: " + notHelpful)
        else:
            notHelpful = "0"

        # Get the response from Rheem if there is one.
        responseClass = review.find('div', {'class': 'BVRRReviewClientResponseText'})
        responseText = ""
        if responseClass is not None:
            response = review.find('span', {'class': 'BVRRPlainTextMarkup'})
            responseText = response.get_text().strip().encode("ascii", "ignore")
        else:
            responseText = "No Response"
        print("Response from Rheem: " + responseText)

        # Get the recommended-product flag (True or False).
        recommended = review.find(
            'div', {'class': 'BVRRReviewDisplayStyle5AdditionalWrapper'})
        recommended = recommended.find('div', {'class': 'BVRRRecommendedContainer'})
        isRecommended = True
        try:
            recommended = recommended.a.extract().get_text()
        except AttributeError:
            isRecommended = False
        print("Is this product recommended? " + str(isRecommended))

        # Get the media indicator.
        hasMedia = review.find('div', {'class': 'BVRRReviewDisplayStyle5Media'})
        isMedia = hasMedia is not None
        print("Does this product have photo/video? " + str(isMedia))

        # Get a direct link to the review, falling back to the product page.
        link = review.find('div', {'class': 'BVRRUserNicknameContainer'})
        reviewLink = ""
        try:
            link = link.a.extract()
            reviewLink = link['href']
        except AttributeError:
            reviewLink = "http://www.homedepot.com/s/" + str(productID)
        print("Link to Review " + reviewLink)
        print("\n")

        # Write all review fields to the csv file in the correct order
        # (must match the connection profile mapping!).
        csvFile.writerow((j, revID, productID, username, reviewTitle, dateString,
                          reviewScore.string.strip(), EoI, quality, value,
                          reviewText, yesHelpful, notHelpful, responseText,
                          str(isRecommended), str(isMedia), reviewLink))
        j = j + 1  # increment the review key counter
    return True  # every review on the page was inside the date range
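# --- Hedged driver sketch (hypothetical, not in the original source): this
# version of scrapePage() fills in the module-level totalPages while handling
# page 1 and returns False as soon as a review falls outside the dateObj
# cutoff, so a page loop over one product might look like this.
def scrape_all_pages(productID, csvWriter):
    page = 1
    while True:
        in_range = scrapePage(productID, page, csvWriter)
        # Stop when a review predates the cutoff or the last page is done.
        if not in_range or page >= totalPages:
            break
        page += 1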
def scrapePage(productID, pageNumber, csvFile):
    # Make the HTTP request.
    request = urllib.urlopen(ROOTURL1 + productID + ROOTURL2 + str(pageNumber) + ROOTURL3)
    string = scraperFunks.prepareHTML(request)

    # Make soup with a string that is clean of bad characters.
    soup = BeautifulSoup(string, "html.parser")
    prettyHTML = soup.prettify()
    # For some reason, a missing apostrophe in the wrong place did some weird
    # things; let's take care of that.
    # [WARNING]: prettyHTML will have extra stuff that isn't in soup.
    prettyHTML = prettyHTML.replace("\'http:=\"\"", "\'http:=\"\"\'")
    # Make BETTER soup with better HTML formatting.
    soup = BeautifulSoup(prettyHTML, "html.parser")

    # Find the total number of reviews and derive the page count
    # (a worked expected_pages example follows this function).
    if pageNumber == 1:
        numReviews = soup.find('div', {'class': 'BVRRHistogramTitle'}).find(
            'span', {'class': 'BVRRNumber'}).string.strip()
        print("Total Number of Reviews: " + numReviews)
        # numReviews is a string, so convert before comparing.
        if int(numReviews) <= 8:
            pages = 1
        else:
            pages = (math.floor((int(numReviews) - 8) / 30)) + 2
            pages = int(pages)
        global totalPages
        totalPages = pages

    # Find all the reviews (should be about 30-ish).
    allReviews = scraperFunks.findAllReviews(soup, "div", "id",
                                             "BVSubmissionPopupContainer")

    # Page 1 holds the first 8 reviews; later pages hold 30 each.
    if pageNumber == 1:
        j = 1
    else:
        j = 9 + (30 * (pageNumber - 2))

    for review in allReviews:
        print("Review Number: " + str(j))
        nicknames = review.find('span', {'itemprop': 'author'})
        print("Username: " + nicknames.string.strip())
        reviewDate = review.find('meta', {'itemprop': 'datePublished'})
        dateString = reviewDate['content'][:-1]
        print("Date: " + dateString)
        if dateObj != -1:
            reviewDateObj = datetime.strptime(dateString, "%Y-%m-%d")
            if reviewDateObj < dateObj:
                return False

        # Find all review information within the review container.
        reviewContainer = review.findAll(
            'div', {'class': 'BVRRReviewDisplayStyle5BodyWrapper'})

        # Gets the review title from the review container.
        reviewTitle = review.find('span', {'itemprop': 'name'})
        print("Review Title: " + reviewTitle.string.strip())

        reviewScore = review.find('span', {'itemprop': 'ratingValue'})
        print("Review Score: " + reviewScore.string.strip())

        reviewText = review.find('div', {'itemprop': 'description'}).findAll(
            'span', {'class': 'BVRRReviewText'})
        # print("Review Text: ")
        # for paragraph in reviewText:
        #     print(paragraph.string.strip() + "\n")
        #     print(paragraph.prettify())
        j = j + 1
        print("\n")
    return True
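# --- Worked example of the paging math used above, with its assumption made
# explicit: page 1 shows 8 reviews and every later page shows 30, which is
# what the formula implies. expected_pages is a hypothetical helper that
# mirrors the formula; it is not part of the original scraper.
def expected_pages(numReviews):
    if numReviews <= 8:
        return 1
    return int(math.floor((numReviews - 8) / 30.0)) + 2

# For 100 reviews: floor((100 - 8) / 30) + 2 = 3 + 2 = 5 pages, i.e.
# 8 + 30 + 30 + 30 = 98 reviews on the first four pages and the last 2 on page 5.
assert expected_pages(8) == 1
assert expected_pages(100) == 5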