Beispiel #1
0
def _writeHelpfulCell(sheet, row, review):
    """Write the helpfulness text of *review* into column 4 of *row*.

    Nothing is written when the review has no matching spans at all. The
    text "Not Helpful" is written when spans exist but none of them mentions
    "helpful".
    """
    # There can be multiple spans with the same class. The first is something
    # like "Pro", "DIY" or "Recommended"; the one we want reads
    # "x in y found this helpful".
    spans = review.findAll('span', {'class': 'small m-left-small'})
    if not spans:
        # findAll gives back an empty result (not None) when nothing matches.
        return
    for span in spans:
        text = span.string
        # .string can be None (e.g. a span with nested markup); skip those
        # instead of failing on the substring test.
        if text is not None and "helpful" in text:
            sheet.write(row, 4, text)
            return
    sheet.write(row, 4, "Not Helpful")


def _writeReviewRow(sheet, row, modelNum, review, reviewDate=None):
    """Write one review into *row*: model (0), date (1), stars (2), text (3), helpful (4).

    When *reviewDate* is None the date text is read from the review markup.
    """
    sheet.write(row, 0, modelNum)

    starRating = str(review.find('div', {'class': 'stars'}).get('rel'))
    sheet.write(row, 2, starRating)

    reviewText = review.find('p', {'class': 'review line-height-more'}).string
    sheet.write(row, 3, reviewText)

    if reviewDate is None:
        reviewDate = review.find(
            'div', {'class': 'small text-muted right'}).string
    sheet.write(row, 1, reviewDate)

    _writeHelpfulCell(sheet, row, review)


def scraper(modelNum, sheet, j):
    """Scrape the reviews of product *modelNum* into *sheet*, starting at row *j*.

    The date of the previous run is read from "LastRanDate.txt". If that file
    cannot be opened, every page of reviews is written (oldest first).
    Otherwise pages are walked from the last one backwards and only reviews
    dated after the stored date are written. On completion today's date is
    stored back into "LastRanDate.txt".

    Args:
        modelNum: model number string used to build the product URL.
        sheet: object exposing ``write(row, column, value)``.
        j: index of the first row to write.

    Returns:
        None when no review container is found for the product, the string
        "No Reviews" or "Only one review" for those two cases, otherwise the
        index of the next unwritten row.
    """
    baseURL = "http://m.homedepot.com"

    allHTML = scraperFunks.getHTML(
        scraperFunks.createURL(modelNum, "http://m.homedepot.com/s/"))

    allReviewsOnPage = scraperFunks.findAllReviews(allHTML, 'li', 'id',
                                                   'reviews')

    # get the last date scraper was ran from the text file
    try:
        # 'with' guarantees the handle is closed even if reading fails
        with open("LastRanDate.txt", "r") as dateFile:
            stringDate = dateFile.read()
        # tolerate a trailing newline / surrounding whitespace in the file
        lastRanDate = datetime.strptime(stringDate.strip(),
                                        "%Y-%m-%d").date()
    except IOError:
        print("No file for date previously run")
        lastRanDate = -1  # sentinel: no previous run recorded

    # Check to see if there are no reviews
    if len(allReviewsOnPage) == 0:
        print("Could not find product with model number " + modelNum)
        return
    for rev in allReviewsOnPage:
        if rev.find(string="No Reviews"):
            print("No Reviews for Model Number " + modelNum)
            return "No Reviews"

    reviewsPage = scraperFunks.findNumberOfReviewsText(
        allReviewsOnPage, 'a', 'class',
        'text-secondary flex space-between flex-grow-1')

    # Check for only one review
    if reviewsPage == []:
        print("Only one review")
        return "Only one review"

    numReviews = scraperFunks.getNumReviews(reviewsPage, 'span', 'class',
                                            'text-primary')

    # determine number of pages of reviews with 10 reviews per page
    numPages = int(math.floor((numReviews / 10)))
    if numReviews % 10 > 0:
        numPages = numPages + 1

    link = reviewsPage.get('href')
    link = link.replace("sort=Most-helpful", "sort=Oldest")

    reviewEntryClass = (
        'reviews-entry p-top-normal p-bottom-normal sborder border-bottom border-default review static-height'
    )
    count = 0

    if lastRanDate == -1:
        # No previous run: walk every page from first to last.
        for pageIndex in range(numPages):
            pageOfReviewsHtml = scraperFunks.getHTML(
                scraperFunks.createURL(link, baseURL))

            allReviewsOnPage = scraperFunks.findAllReviews(
                pageOfReviewsHtml, 'div', 'class', reviewEntryClass)

            # Write reviews from page into excel sheet
            for review in allReviewsOnPage:
                count += 1
                _writeReviewRow(sheet, j, modelNum, review)
                j = j + 1

            if pageIndex != (numPages - 1):
                # the last anchor of the pagination list is used as the
                # link to the following page
                paginationSoup = pageOfReviewsHtml.find_all(
                    'ul', {'class': 'pagination'})
                allAElements = paginationSoup[0].find_all('a')
                link = allAElements[len(allAElements) - 1].get('href')
    else:
        # Previous run known: start at the last page (newest reviews under
        # sort=Oldest) and walk backwards until an already-seen date appears.
        for pageNumber in range(numPages, 0, -1):
            # NOTE(review): this replace only matches a link that still
            # contains "/1?", and the pagination link is refreshed only while
            # pageNumber > 2 -- confirm that page 1 is really reached when
            # stepping down from page 2.
            link = link.replace("/1?", "/" + str(pageNumber) + "?")

            pageOfReviewsHtml = scraperFunks.getHTML(
                scraperFunks.createURL(link, baseURL))

            allReviewsOnPage = scraperFunks.findAllReviews(
                pageOfReviewsHtml, 'div', 'class', reviewEntryClass)

            noMoreNewReviews = False
            # bottom of the page first
            for review in reversed(allReviewsOnPage):
                reviewDate = review.find(
                    'div', {'class': 'small text-muted right'}).string
                reviewDateTime = datetime.strptime(reviewDate,
                                                   "%B %d, %Y").date()
                # check if it is a new review
                if reviewDateTime <= lastRanDate:
                    noMoreNewReviews = True
                    break

                count += 1
                _writeReviewRow(sheet, j, modelNum, review, reviewDate)
                j = j + 1

            if noMoreNewReviews:
                break

            if pageNumber > 2:
                # the first anchor of the pagination list is used as the
                # link to the preceding page
                paginationSoup = pageOfReviewsHtml.find_all(
                    'ul', {'class': 'pagination'})
                allAElements = paginationSoup[0].find_all('a')
                link = allAElements[0].get('href')

    # remember today's date for the next run
    date = (datetime.today()).date()
    with open("LastRanDate.txt", "w") as dateFile:
        dateFile.write(str(date))

    print("Number of Reviews " + str(count) + " for productID " +
          str(modelNum))

    return j
Beispiel #2
0
def scrapePage(productID, pageNumber, csvFile):
    """Scrape one page of reviews for *productID* and write one csv row per review.

    The page is fetched from ROOTURL1 + productID + ROOTURL2 + pageNumber +
    ROOTURL3. The review markup is taken from the 9th line of the response,
    after the text "SourceID", cleaned up and parsed twice with BeautifulSoup.

    Args:
        productID: product identifier string inserted into the URL.
        pageNumber: page index inserted into the URL.
        csvFile: object exposing ``writerow(sequence)``.

    Raises:
        ValueError: if the response has fewer than 9 lines, or line 9 does
            not contain "SourceID".
    """
    # Make TCP Request
    request = urllib.urlopen(ROOTURL1 + productID + ROOTURL2 +
                             str(pageNumber) + ROOTURL3)
    try:
        lines = request.readlines()
    finally:
        # release the connection as soon as the payload has been read
        request.close()

    # Find where our reviews are located in BazaarVoice JavaScript
    string = None
    for lineNumber, line in enumerate(lines, 1):
        if lineNumber == 9:
            index = line.index("SourceID")
            string = line[index + 11:]
            string = string[:len(string) - 6]
            break
    if string is None:
        # previously this surfaced as an unbound-variable error further down
        raise ValueError("response has fewer than 9 lines; no review payload")

    # Remove bad characters
    string = string.replace("<br />", "\t")
    string = string.replace("\\/", "/")
    string = string.replace("\\\"", "")
    string = string.replace(
        chr(ord(u'\xbd')),
        "")  # found this odd character "HALFWIDTH HANGUL LETTER PHIEUPH"
    string = string.replace(chr(ord(u'\xc2')), "")  # This one looks like a T.

    # Make soup with a string that is clean of bad characters
    soup = BeautifulSoup(string, "html.parser")
    prettyHTML = soup.prettify()

    # for some reason, missing an apostrophe in the wrong place did some weird things.
    # lets take care of that.
    # [WARNING]: prettyHTML will have extra stuff that isn't in soup.
    prettyHTML = prettyHTML.replace("\'http:=\"\"", "\'http:=\"\"\'")

    # Make BETTER soup with better html formatting
    soup = BeautifulSoup(prettyHTML, "html.parser")

    # Find all the reviews (should be about 30-ish).
    allReviews = scraperFunks.findAllReviews(soup, "div", "id",
                                             "BVSubmissionPopupContainer")
    # j numbers the reviews of this page, starting at 1
    for j, review in enumerate(allReviews, 1):
        print("Review Number: " + str(j))

        reviewDate = review.find('meta', {'itemprop': 'datePublished'})
        # the last character of the content attribute is dropped for the csv
        date = reviewDate['content'][:-1]
        print("Date: " + reviewDate['content'])

        # NOTE(review): the original line here was garbled into a single
        # invalid print; the username print and the title lookup below are
        # a reconstruction -- confirm against the upstream source.
        nickname = review.find('span', {'itemprop': 'author'})
        print("Username: " + nickname.string.strip())

        reviewTitle = review.find('span', {'itemprop': 'name'})
        print("Review Title: " + reviewTitle.string.strip())

        reviewScore = review.find('span', {'itemprop': 'ratingValue'})
        scoreText = reviewScore.string.strip()
        print("Review Score: " + scoreText)

        reviewTextSpans = review.find('div', {
            'itemprop': 'description'
        }).findAll('span', {'class': 'BVRRReviewText'})

        # A review may be split over several spans; join them with tabs.
        print("Review Text: ")
        reviewText = ""
        for paragraph in reviewTextSpans:
            paragraphText = paragraph.string.strip()
            print(paragraphText)
            reviewText = reviewText + paragraphText + "\t"

        yesHelpful = review.find('a', {
            'title': 'helpful'
        }).find('span', {'class': 'BVDINumber'})
        yesText = yesHelpful.string.strip()
        print("How many people found this Helpful: " + yesText)

        notHelpful = review.find('a', {
            'title': 'unhelpful'
        }).find('span', {'class': 'BVDINumber'})
        noText = notHelpful.string.strip()
        print("How many people found this Not Helpful: " + noText)

        print("\n")

        csvFile.writerow((j, productID, date, scoreText, reviewText.strip(),
                          yesText, noText))
Beispiel #3
0
def scrapePage(productID, pageNumber, csvFile):
    """Scrape one page of reviews for *productID* and write one csv row per review.

    Reads the module-level ``dateObj`` (``-1`` disables the date cut-off) and
    reads/advances the module-level review counter ``j``. When *pageNumber*
    is 1 the module-level ``totalPages`` is set from the review total shown
    on the page. The prettified HTML is also dumped to "test2.txt".

    Args:
        productID: product identifier string inserted into the URL.
        pageNumber: page index inserted into the URL.
        csvFile: object exposing ``writerow(sequence)``.

    Returns:
        False as soon as a review dated before ``dateObj`` is met (reviews
        before it on this page have already been written), True otherwise.
    """
    global j
    global totalPages

    # Make TCP Request
    request = urllib.urlopen(ROOTURL1 + productID + ROOTURL2 +
                             str(pageNumber) + ROOTURL3)

    # removes any output that cannot be processed by BeautifulSoup
    string = scraperFunks.prepareHTML(request)

    # Make soup with a string that is clean of bad characters
    soup = BeautifulSoup(string, "html.parser")
    prettyHTML = soup.prettify()

    # debug dump of the prettified page; 'with' closes the file on error too
    with open("test2.txt", "w") as out:
        out.write(prettyHTML)

    # for some reason, missing an apostrophe in the wrong place did some weird things.
    # lets take care of that.
    # [WARNING]: prettyHTML will have extra stuff that isn't in soup.
    prettyHTML = prettyHTML.replace("\'http:=\"\"", "\'http:=\"\"\'")

    # Make BETTER soup with better html formatting
    soup = BeautifulSoup(prettyHTML, "html.parser")

    # find the total number of reviews
    if pageNumber == 1:
        # attrs must be a dict; the previous set literal {'class', name} only
        # worked because a non-dict attrs value is treated as a class filter
        numReviews = soup.find('div', {'class': 'BVRRHistogramTitle'}).find(
            'span', {'class': 'BVRRNumber'}).string.strip()
        print("Total Number of Reviews: " + numReviews)
        # numReviews is text: convert before comparing, otherwise the
        # single-page branch is never taken
        reviewTotal = int(numReviews)
        if reviewTotal <= 8:
            pages = 1
        else:
            # NOTE(review): this yields one page too many when
            # (reviewTotal - 8) is an exact multiple of 30 -- confirm the
            # 8-then-30 page sizes before tightening.
            pages = int(math.floor((reviewTotal - 8) // 30)) + 2
        totalPages = pages

    # Find all the reviews (should be about 30-ish).
    allReviews = scraperFunks.findAllReviews(soup, "div", "class",
                                             "BVRRContentReview")

    # parse through each review on page
    for review in allReviews:
        # the review id is what follows the first 27 characters of the
        # element id
        revID = review['id'][27:]
        print("ReviewID: " + revID)

        # check the date of the review and compare it to the cut-off date
        reviewDate = review.find('meta', {'itemprop': 'datePublished'})
        # Format date output (drop the last character of the attribute)
        dateString = reviewDate['content'][:-1]
        print("Date: " + dateString)
        if dateObj != -1:
            reviewDateObj = datetime.strptime(dateString, "%Y-%m-%d")
            if reviewDateObj < dateObj:
                return False  # the review was outside the date range

        # Output for debugging purposes
        print("Review Number: " + str(j))

        # Extra Field that is not in current output (moved to Cycle 2)
        nicknames = review.find('span', {'itemprop': 'author'})
        username = nicknames.get_text().strip()
        # NOTE(review): the original line here was garbled into a single
        # invalid print; the username print and the title lookup below are
        # a reconstruction -- confirm against the upstream source.
        print("Username: " + username)

        reviewTitle = review.find('span',
                                  {'itemprop': 'name'}).get_text().strip()
        print("Review Title: " + reviewTitle)

        EoI = ""
        quality = ""
        value = ""
        subRatingHolder = review.findAll('img', {'class': 'BVImgOrSprite'})
        # indexes 1..3 are read, so at least four images are required
        if len(subRatingHolder) > 3:
            EoI = subRatingHolder[1]['title']
            quality = subRatingHolder[2]['title']
            value = subRatingHolder[3]['title']

            print("EoI: " + EoI)
            print("Q: " + quality)
            print("V: " + value)

        # Gets review score
        reviewScore = review.find('span', {'itemprop': 'ratingValue'})
        scoreText = reviewScore.string.strip()
        print("Review Score: " + scoreText)

        # Gets review text
        reviewTextSpans = review.find('div', {
            'itemprop': 'description'
        }).findAll('span', {'class': 'BVRRReviewText'})
        print("Review Text: ")
        # Reviews can be separated into multiple <spans> if the review used
        # newline, put them into one string for output
        reviewText = ""
        for paragraph in reviewTextSpans:
            reviewText = reviewText + paragraph.string.strip() + "\t"
        # Remove any special characters that cannot be saved into csv file
        # (non ascii (128))
        reviewText = reviewText.strip().encode("ascii", "ignore")
        print(reviewText)

        # Gets the number of people who marked the review as helpful
        # (if none recorded outputs 0)
        yesHelpful = review.find('a', {'title': 'helpful'})
        if yesHelpful is not None:
            yesHelpful = yesHelpful.find('span', {
                'class': 'BVDINumber'
            }).string.strip()
            print("How many people found this Helpful: " + yesHelpful)
        else:
            yesHelpful = "0"

        # Gets the number of people who marked the review as not helpful
        # (if none recorded outputs 0)
        notHelpful = review.find('a', {'title': 'unhelpful'})
        if notHelpful is not None:
            notHelpful = notHelpful.find('span', {
                'class': 'BVDINumber'
            }).string.strip()
            print("How many people found this Not Helpful: " + notHelpful)
        else:
            notHelpful = "0"

        # Get the Response from Rheem if there is one
        responseClass = review.find('div',
                                    {'class': 'BVRRReviewClientResponseText'})
        if responseClass is not None:
            response = review.find('span', {'class': 'BVRRPlainTextMarkup'})
            responseText = response.get_text().strip().encode(
                "ascii", "ignore")
        else:
            responseText = "No Response"

        print("Response from Rheem: " + responseText)

        # Get Recommended Product True or False
        isRecommended = True
        try:
            recommended = review.find(
                'div', {'class': 'BVRRReviewDisplayStyle5AdditionalWrapper'})
            recommended = recommended.find(
                'div', {'class': 'BVRRRecommendedContainer'})
            # a missing wrapper, container or anchor all surface as
            # AttributeError on None
            recommended.a.extract().get_text()
        except AttributeError:
            isRecommended = False
        print("Is this product recommended? " + str(isRecommended))

        # Get media indicator
        hasMedia = review.find('div',
                               {'class': 'BVRRReviewDisplayStyle5Media'})
        isMedia = hasMedia is not None
        print("Does this product have photo/video? " + str(isMedia))

        link = review.find('div', {'class': 'BVRRUserNicknameContainer'})
        try:
            link = link.a.extract()
            reviewLink = link['href']
        except (AttributeError, KeyError):
            # no nickname container / anchor / href: fall back to the
            # product search URL
            reviewLink = "http://www.homedepot.com/s/" + str(productID)

        print("Link to Review " + reviewLink)

        print("\n")

        # Write all review fields to csv file in the correct order
        # (must match connection profile mapping!)
        csvFile.writerow(
            (j, revID, productID, username, reviewTitle, dateString,
             scoreText, EoI, quality, value, reviewText,
             yesHelpful, notHelpful, responseText, str(isRecommended),
             str(isMedia), reviewLink))

        j = j + 1  # increments the review key counter

    return True  # the review was inside the date range
Beispiel #4
0
def scrapePage(productID, pageNumber, csvFile):
    """Fetch one page of reviews for *productID* and print a summary of each review.

    Reads the module-level ``dateObj`` (``-1`` disables the date cut-off).
    When *pageNumber* is 1 the module-level ``totalPages`` is set from the
    review total shown on the page. *csvFile* is accepted but not written to
    by this version.

    Args:
        productID: product identifier string inserted into the URL.
        pageNumber: page index inserted into the URL; also used to derive the
            running review number (page 1 starts at 1, page 2 at 9, each
            later page 30 further on).
        csvFile: unused here.

    Returns:
        False as soon as a review dated before ``dateObj`` is met, True
        otherwise.
    """
    global totalPages

    # Make TCP Request
    request = urllib.urlopen(ROOTURL1 + productID + ROOTURL2 +
                             str(pageNumber) + ROOTURL3)

    string = scraperFunks.prepareHTML(request)

    # Make soup with a string that is clean of bad characters
    soup = BeautifulSoup(string, "html.parser")
    prettyHTML = soup.prettify()

    # for some reason, missing an apostrophe in the wrong place did some weird things.
    # lets take care of that.
    # [WARNING]: prettyHTML will have extra stuff that isn't in soup.
    prettyHTML = prettyHTML.replace("\'http:=\"\"", "\'http:=\"\"\'")

    # Make BETTER soup with better html formatting
    soup = BeautifulSoup(prettyHTML, "html.parser")

    # find the total number of reviews
    if pageNumber == 1:
        # attrs must be a dict; the previous set literal {'class', name} only
        # worked because a non-dict attrs value is treated as a class filter
        numReviews = soup.find('div', {'class': 'BVRRHistogramTitle'}).find(
            'span', {'class': 'BVRRNumber'}).string.strip()
        print("Total Number of Reviews: " + numReviews)
        # numReviews is text: convert before comparing, otherwise the
        # single-page branch is never taken
        reviewTotal = int(numReviews)
        if reviewTotal <= 8:
            pages = 1
        else:
            # NOTE(review): this yields one page too many when
            # (reviewTotal - 8) is an exact multiple of 30 -- confirm before
            # tightening.
            pages = int(math.floor((reviewTotal - 8) // 30)) + 2
        totalPages = pages

    # Find all the reviews (should be about 30-ish).
    allReviews = scraperFunks.findAllReviews(soup, "div", "id",
                                             "BVSubmissionPopupContainer")

    # running review number of the first review on this page
    if pageNumber == 1:
        j = 1
    else:
        j = 9 + (30 * (pageNumber - 2))

    for review in allReviews:
        print("Review Number: " + str(j))

        # NOTE(review): the original line here was garbled into a single
        # invalid print; the username print and the date lookup below are
        # a reconstruction -- confirm against the upstream source.
        nicknames = review.find('span', {'itemprop': 'author'})
        print("Username: " + nicknames.get_text().strip())

        reviewDate = review.find('meta', {'itemprop': 'datePublished'})
        dateString = reviewDate['content'][:-1]
        print("Date: " + dateString)
        if dateObj != -1:
            reviewDateObj = datetime.strptime(dateString, "%Y-%m-%d")
            if reviewDateObj < dateObj:
                return False  # the review is older than the cut-off date

        # Gets review title
        reviewTitle = review.find('span', {'itemprop': 'name'})
        print("Review Title: " + reviewTitle.string.strip())

        reviewScore = review.find('span', {'itemprop': 'ratingValue'})
        print("Review Score: " + reviewScore.string.strip())

        j = j + 1
        print("\n")

    return True