# Example #1 (score: 0)
 def crawl(self, response, category, servicename):
     """Scrape user reviews, star ratings, authors and the site name from a
     review-listing page and persist each review as a ServiceRecord.

     Args:
         response: Scrapy Response for the review-listing page.
         category: category label stored on every record.
         servicename: service name stored on every record.
     """
     self.category = category
     self.servicename = servicename
     print("review from seniordatingsites.com")
     # Sample page layout: https://www.highya.com/coinbase-reviews
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath(
             "//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-review']"
         )
     ]
     ratings1 = response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-stars']/img/@src").extract()
     # BUG FIX: the original left `ratings` as a Python 3 map() iterator,
     # so len(ratings) and ratings[item] below raised TypeError. Build a
     # real list instead. The '.' stripped here is left over from the star
     # image filename that getStarts parses.
     ratings = [str(getStarts(src)).replace('.', '') for src in ratings1]
     authors = response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-name']/text()").extract()
     website_name = response.xpath("//div[@id='container']/div[@id='header']/div[@class='left eight columns']/div/a[@class='logo']/img/@title").extract()
     print(" Ratings ", len(ratings), ratings)
     print(" Reviews ", len(reviews), reviews)
     print(" authors ", len(authors), authors)
     print(" website_name ", len(website_name), website_name)
     for item in range(len(reviews)):
         record = ServiceRecord(response.url, ratings[item], None, None, authors[item],
                                category, servicename, reviews[item], None, website_name)
         record.save()
# Example #2 (score: 0)
 def crawl(self, response, category, servicename):
     """Scrape wpcr3-widget reviews, dates, authors and ratings from a
     hosting-review page and save one ServiceRecord per review.
     """
     self.category = category
     self.servicename = servicename
     # TODO: rating comes in as a percentage
     # https://top11hosting.com/hostgator-review/
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath(
             "//div[@class='wpcr3_item wpcr3_business']/div/blockquote[@class='wpcr3_content']"
         )
     ]
     ratings = response.xpath(
         "//div[@class='wpcr3_rating_style1_average']/@style").extract()
     # Discard the first two style entries — presumably aggregate/summary
     # widgets rather than per-review ratings; TODO confirm against the page.
     for _ in range(2):
         ratings.pop(0)
     dates = response.xpath(
         "//div[@class='wpcr3_review_datePublished']/text()").extract()
     authors = response.xpath(
         "//div[@class='wpcr3_review_author']/span[@class='wpcr3_caps']/text()"
     ).extract()
     website_name = response.xpath(
         "//div[@class='wpcr3_item_name']/a/text()").extract()
     # Convert the percentage value (0-100) into a 0-5 star score.
     ratings1 = [str(int(getStarts(raw)) / 20) for raw in ratings]
     for idx in range(len(reviews)):
         record = ServiceRecord(response.url, ratings1[idx], None,
                                dates[idx], authors[idx], category,
                                servicename, reviews[idx], None,
                                website_name)
         record.save()
# Example #3 (score: 0)
    def crawl(self, response, category, servicename):
        """Scrape reviews from a hosting-review page (two possible layouts)
        and save one ServiceRecord per review.

        Individual star sub-scores arrive in groups of five per review and
        are averaged into a single value.

        Args:
            response: Scrapy Response for the review-listing page.
            category: category label stored on every record.
            servicename: service name stored on every record.
        """
        self.category = category
        self.servicename = servicename
        authors = response.xpath(
            "//div[@class='comment-user-left name']/text()").extract()
        dates = response.xpath(
            "//div[@class='comment-user-left date']/text()").extract()
        website_name = response.xpath(
            "//div[@id='line']/a[1]/img/@alt").extract()
        headings = response.xpath(
            "//div[@class='comments_user_comment']/a/text()").extract()
        ratings1 = response.xpath(
            "//div[@class='comment_user_star_rate']/div[@class='comment_user_stars']/img/@src"
        ).extract()
        if not ratings1:
            # Alternate page layout: rating lives in the alt text of the
            # overall-stars image.
            ratings1 = response.xpath(
                "//div[@class='rating pure-u-1 pure-u-lg-1-3']/img[@class='stars overall']/@alt"
            ).extract()
        # BUG FIX: the original used `i` in a while-loop without ever
        # initialising it (NameError), and the two map() calls left
        # `ratings` as a Python 3 iterator, breaking len()/indexing below.
        ratings = [
            getStarts(raw).replace('-', '').replace('.', '')
            for raw in ratings1
        ]
        # Average every consecutive group of five sub-scores into one value.
        # (`chunk_sum` avoids shadowing the builtin `sum`.)
        ratings2 = []
        chunk_sum = 0
        for idx in range(len(ratings)):
            if idx % 5 != 0 and idx != 0:
                chunk_sum += int(ratings[idx])
            else:
                if idx != 0:
                    ratings2.append(str(chunk_sum / 5.0))
                chunk_sum = int(ratings[idx])
        ratings2.append(str(chunk_sum / 5.0))
        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath('//div[@class="comment-body"]')
        ]
        if not reviews:
            # Alternate page layout for the review bodies.
            reviews = [
                node.xpath('string()').extract()
                for node in response.xpath(
                    '//div[@class="comment pure-u-1 pure-u-lg-2-3 wcc"]')
            ]

        for item in range(len(reviews)):
            record = ServiceRecord(response.url, ratings2[item],
                                   headings[item], dates[item],
                                   authors[item], category, servicename,
                                   reviews[item], None, website_name)
            record.save()
    def str11(self):
        """Return this record as a plain dict ready for serialisation.

        Returns:
            dict mapping export field names to the record's attributes,
            with the rating normalised via utils.getStarts and the date
            parsed via utils.convertDate (e.g. into a datetime such as
            datetime.datetime(2010, 2, 15, 0, 0)).

        NOTE(review): assumes ``self.reviews`` is non-empty — confirm that
        callers guarantee at least one review before calling this.
        """
        # Dropped an unused local (`store_data_dict`) that was never read.
        return {
            "absolute_url": self.url,
            "rating": utils.getStarts(self.rating),
            "review_title": self.heading,
            "reviewed_at": utils.convertDate(self.date),
            "reviewer_name": self.author,
            "category": self.category,
            "service_name": self.service_name,
            "review_text": self.reviews[0],
            "picture_urls": self.img_src,
            "website_name": self.website_name,
        }
    def crawl(self, response, category, servicename):
        """Scrape reviews from blackpeoplemeet.pissedconsumer.com listing
        pages, save one ServiceRecord per review, and follow pagination.

        Args:
            response: Scrapy Response for the listing page.
            category: category label stored on every record.
            servicename: service name stored on every record.

        Yields:
            Requests for the next listing page, when one is linked.
        """
        self.category = category
        self.servicename = servicename
        print("review from blackpeoplemeet.pissedconsumer.com")
        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath(
                "//div[@class='mid_left']/div[@class='mid_left_site']/div[@class='latest_reviews_a']/div[@class='latest_reviews_content']/div"
            )
        ]
        ratings1 = response.xpath(
            "//div[@class='mid_left']/div[@class='mid_left_site']/div[@class='latest_reviews_a']/div[@class='latest_reviews_content']/font/img/@src"
        ).extract()
        dates1 = response.xpath(
            "//div[@class='mid_left']/div[@class='mid_left_site']/div[@class='latest_reviews_a']/div[@class='latest_reviews_content']/font/span[1]/text()"
        ).extract()
        headings = response.xpath(
            "//div[@class='mid_left']/div[@class='mid_left_site']/div[@class='latest_reviews_a']/div[@class='latest_reviews_content']/font/a/text()"
        ).extract()
        website_name = response.xpath(
            "//div[@class='wpcr3_item_name']/a/text()").extract()
        # BUG FIX: the original chained map() calls left `dates1` as a
        # Python 3 iterator, so len(dates1) and dates1[i] below raised
        # TypeError. Clean the strings into a real list instead.
        dates1 = [s.replace(u'\xa0', u'').replace('By', '') for s in dates1]
        # Each cleaned entry looks like "<author>|<date>"; split it apart.
        authors = []
        dates = []
        for entry in dates1:
            author_details = entry.split('|')
            authors.append(author_details[0])
            dates.append(author_details[1])
        dates2 = []
        for idx in range(len(dates)):
            # NOTE(review): the original split `dates1` (not `dates`) on a
            # six-space run here — preserved as-is, but verify `dates` was
            # not intended.
            dates2.append(dates1[idx].split('      ')[1])

        # The star image src parses to '01.' for a 5-star review; anything
        # else is treated as 0 stars.
        ratings = [5 if getStarts(src) == '01.' else 0 for src in ratings1]

        # Average every consecutive group of five sub-scores into one value.
        # (`chunk_sum` avoids shadowing the builtin `sum`.)
        ratings2 = []
        chunk_sum = 0
        for idx in range(len(ratings)):
            if idx % 5 != 0 and idx != 0:
                chunk_sum += int(ratings[idx])
            else:
                if idx != 0:
                    ratings2.append(str(round(chunk_sum / 5.0, 2)))
                chunk_sum = int(ratings[idx])
        ratings2.append(str(round(chunk_sum / 5.0, 2)))
        for item in range(len(reviews)):
            record = ServiceRecord(response.url, ratings2[item],
                                   headings[item], dates2[item],
                                   authors[item], category, servicename,
                                   reviews[item], None, website_name)
            record.save()

        next_page = response.xpath(
            "//div[@class='mid_left']/div[@class='mid_left_site']/div[@class='viciao']/div[@class='page_cut']/a[@class='cur_pageva']/@href"
        ).extract()
        # extract() always returns a list (never None), so only the
        # non-empty-string check is meaningful here.
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url)
            yield response.follow(next_page_url, callback=self.parsing)