def crawl(self, response, category, servicename):
    """Store the customer reviews found on a pickuphost.com review page.

    Sample page: http://pickuphost.com/review/bluehost/#customer_review_shap

    Review bodies, ratings, headings, dates and authors are pulled from
    ``response`` as parallel lists and combined by position into
    ``ServiceRecord`` objects, each of which is saved immediately.
    ``category`` and ``servicename`` are passed through to every record.

    NOTE(review): the entry at index 0 is never saved, and any field list
    shorter than ``reviews`` raises ``IndexError`` -- confirm both are
    intended.
    """
    reviews = [
        node.xpath('string()').extract()
        for node in response.xpath(
            "//div[@class='one_rew']/div[@class='rewiwer_post']/span")
    ]
    ratings = response.xpath(
        "//div[@class='col-md-12 avg_ratting_bg text-center']/div[@class='avg_ratting text-center']/text()"
    ).extract()
    headings = response.xpath(
        "//div[@id='rew_replace_div']/div[@class='one_rew']/h4/b/text()"
    ).extract()
    dates = response.xpath(
        "//div[@id='rew_replace_div']/div[@class='one_rew']/span[@class='rewiwer_data']/span[2]/text()"
    ).extract()
    authors = response.xpath(
        "//div[@id='rew_replace_div']/div[@class='one_rew']/span[@class='rewiwer_data']/span[1]/text()"
    ).extract()

    # The site name is the part of the navbar link that precedes the first
    # "." with everything up to the last "/" removed.
    home_links = response.xpath(
        "//div[@class='navbar-header']/a/@href").extract()
    before_first_dot = home_links[0].split(".")[0]
    website_name = before_first_dot.split("/")[-1]

    # Index 0 is skipped on purpose here to mirror the existing behaviour.
    for idx, review in enumerate(reviews[1:], 1):
        record = ServiceRecord(response.url, ratings[idx], headings[idx],
                               dates[idx], authors[idx], category,
                               servicename, review, "", website_name)
        record.save()
Beispiel #2
0
 def crawl(self, response, category, servicename):
     """Store the reviews found on a pissedconsumer.com review page.

     Review texts, ratings and dates are extracted from ``response`` as
     parallel lists and combined by position into ``ServiceRecord``
     objects (heading, author and image are stored as ``None``), each of
     which is saved immediately. ``category`` and ``servicename`` are
     remembered on the spider and passed through to every record.

     The date selector previously lacked the leading ``//`` and therefore
     could never match anything below the document root; it is now
     anchored like the review selector.

     NOTE(review): the rating and site-name selectors do not reference the
     ``middleware-review-container`` markup used for the reviews -- verify
     them against the live page. A field list shorter than ``reviews``
     still raises ``IndexError``.
     """
     self.category = category
     self.servicename = servicename
     print("review from blackpeoplemeet.pissedconsumer.com")
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath(
             "//div[@class='middleware-review-container'][1]/div/div[@class='f-component-info']/div[@class='f-component-text']/div[@class='overflow-text']"
         )
     ]
     ratings = response.xpath(
         "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock  ']/div[@class='comment  ']/ul[@class='postby']/li[2]/span[@class='smallStars']/@data-score"
     ).extract()
     # Anchored with "//" so the container is found anywhere in the page.
     dates = response.xpath(
         "//div[@class='middleware-review-container']/div/div[@class='f-component-info']/div[@class='f-component-info-header']/time[@class='post-time secondary-info']/text()"
     ).extract()
     website_name = response.xpath(
         "//div[@class='wpcr3_item_name']/a/text()").extract()
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings[idx], None,
                                dates[idx], None, category,
                                servicename, review, None,
                                website_name)
         record.save()
    def crawl(self, response, category, servicename):
        """Store the reviews on a Yelp business page and follow pagination.

        Sample page: https://www.yelp.com/biz/fatcow-burlington

        Review texts, ratings, dates and authors are extracted from
        ``response`` as parallel lists, printed for diagnostics and combined
        by position into ``ServiceRecord`` objects that are saved
        immediately. If a "next" pagination link is present, a follow-up
        request with ``self.parsing`` as callback is yielded.
        """
        self.category = category
        self.servicename = servicename
        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath('//div[@class="review-content"]')
        ]

        ratings = response.xpath("//div[@class='biz-rating biz-rating-large clearfix']/div/div/@title").extract()
        dates = response.xpath("//div[@class='biz-rating biz-rating-large clearfix']/span[@class='rating-qualifier']/text()").extract()
        authors = response.xpath("//div[@class='media-story']/ul[@class='user-passport-info']/li[@class='user-name']/a[@id='dropdown_user-name']/text()").extract()
        website_name = response.xpath("//html/head/meta[6]/@content").extract()

        # Diagnostic dump of everything that was scraped.
        print(" Ratings ", len(ratings), ratings)
        print("dates ", len(dates), dates)
        print(" Reviews ", len(reviews), reviews)
        print(" authors ", len(authors), authors)
        print(" website_name ", len(website_name), website_name)

        for idx, review in enumerate(reviews):
            record = ServiceRecord(response.url, ratings[idx], None,
                                   dates[idx], authors[idx], category,
                                   servicename, review, "", website_name)
            record.save()

        next_links = response.xpath("//div[@class='arrange_unit']/a[@class='u-decoration-none next pagination-links_anchor']/@href").extract()
        if next_links is None:
            return
        next_page_url = "".join(next_links)
        if not next_page_url.strip():
            # No usable pagination link: this was the last page.
            return
        print(type(next_page_url))
        print(next_page_url, "    url")
        yield response.follow(url=next_page_url, callback=self.parsing)
Beispiel #4
0
 def crawl(self, response, category, servicename):
     """Store comment-style reviews from a page and follow pagination.

     Authors, dates, comment texts and avatar image URLs are extracted
     from ``response``; authors, dates and texts are combined by position
     into ``ServiceRecord`` objects (no rating or heading) that are saved
     immediately. The full avatar list and the page title list are
     attached to every record. If a ``nav-previous`` link is present, a
     follow-up request with ``self.parsing`` as callback is yielded.
     """
     self.category = category
     self.servicename = servicename
     authors = response.xpath(
         "//div[@class='comment-meta commentmetadata']/cite[@class='fn']/text()"
     ).extract()
     dates = response.xpath(
         "//div[@class='comment-time']/a/time/text()").extract()
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath("//div[@class='comment-text']/span")
     ]
     print("reviews ", reviews)
     img_src = response.xpath(
         "//div[@class='vcard-wrap']/img[@class='avatar avatar-100 wp-user-avatar wp-user-avatar-100 photo avatar-default']/@src"
     ).extract()
     website_name = response.xpath("///html/head/title/text()").extract()

     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, None, None, dates[idx],
                                authors[idx], category, servicename,
                                review, img_src, website_name)
         record.save()

     next_links = response.xpath(
         "//div[@class='nav-previous']/a/@href").extract()
     if next_links is None:
         return
     next_page_url = "".join(next_links)
     if not next_page_url.strip():
         # Nothing to follow.
         return
     print(type(next_page_url))
     print(next_page_url)
     yield response.follow(url=next_page_url, callback=self.parsing)
 def crawl(self, response, category, servicename):
     """Store the comment-style reviews found on a restoreprivacy.com page.

     Comment texts, dates and authors are extracted from ``response`` as
     parallel lists and combined by position into ``ServiceRecord``
     objects (no rating, heading or image) that are saved immediately.
     ``category`` and ``servicename`` are remembered on the spider and
     passed through to every record.
     """
     self.category = category
     self.servicename = servicename
     print("review from restoreprivacy.com")
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath("//div[@class='comment-text-inner']")
     ]
     dates = response.xpath(
         "//div[@class='comment-author vcard']/span[@class='ago']/text()"
     ).extract()
     authors = response.xpath(
         "//div[@class='comment-author vcard']/span[@class='fn']/span/text()"
     ).extract()
     # Avatar URLs are queried but not stored on the records below.
     img_src = response.xpath(
         "//div[@class='comment-author vcard']/img[@class='avatar avatar-50 photo']/@src"
     ).extract()
     website_name = response.xpath(
         "//div[@class='title-area']/p[@class='site-title']/a/text()"
     ).extract()
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, None, None, dates[idx],
                                authors[idx], category, servicename,
                                review, None, website_name)
         record.save()
    def crawl(self, response, category, servicename):
        """Store the reviews on a productreview.com.au page and paginate.

        Sample page: https://www.productreview.com.au/p/smart-fares.html

        Review texts, ratings, headings, dates and authors are extracted
        from ``response`` as parallel lists, printed for diagnostics and
        combined by position into ``ServiceRecord`` objects that are saved
        immediately. The product image list and site-name list are attached
        to every record. If the pagination link is present, a follow-up
        request with ``self.parsing`` as callback is yielded.

        The date selector is now evaluated once instead of twice, and the
        always-true ``is not None`` check on the extracted link list was
        dropped.

        NOTE(review): a field list shorter than ``reviews`` raises
        ``IndexError``; the pagination selector is tied to the seventh
        ``li`` -- verify against the live page.
        """
        self.category = category
        self.servicename = servicename
        print("review from productreview.com")
        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath("//div[@class='review-overall']")
        ]
        ratings = response.xpath("//div[@class='rating-md']/p/span/span[@itemprop='ratingValue']/@content").extract()
        headings = response.xpath("//div[@class='review-content']/h3/text()").extract()
        dates = response.xpath("//div[@class='review-content']/div[@class='rating-md']/p/meta/@content").extract()
        authors = response.xpath("//div[@class='review-author']/h6/a/text()").extract()
        img_src = response.xpath("//div[@class='item-header-img']/span[@class='item-header-img-container']/img/@src").extract()
        website_name = response.xpath("/html/head/meta[7]/@content").extract()

        # Diagnostic dump of everything that was scraped.
        print("dates ", len(dates), dates)
        print(" Reviews ", len(reviews), reviews)
        print(" headings ", len(headings), headings)
        print(" authors ", len(authors), authors)
        print(" website_name ", len(website_name), website_name)

        for idx, review in enumerate(reviews):
            record = ServiceRecord(response.url, ratings[idx], headings[idx],
                                   dates[idx], authors[idx], category,
                                   servicename, review, img_src,
                                   website_name)
            record.save()

        # extract() returns a (possibly empty) list, so joining is safe.
        next_page_url = "".join(
            response.xpath("//div[@class='pagination-container']/ul[@class='pagination']/li[7]/a/@href").extract())
        if next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url)
            yield response.follow(next_page_url, callback=self.parsing)
 def crawl(self, response, category, servicename):
     """Store the reviews found on a Capterra product page.

     Sample page: https://www.capterra.com/p/170765/ExpressVPN/

     Review texts, ratings, headings and dates are extracted from
     ``response`` as parallel lists and combined by position into
     ``ServiceRecord`` objects (author stored as ``None``) that are saved
     immediately. The thumbnail list and site-logo alt-text list are
     attached to every record.
     """
     print("Reviews from Capterra.con")
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath(
             '//div[@class="review-comments  color-text"]')
     ]
     ratings = response.xpath(
         "//div[@class='overall-rating-container']/span[@class='overall-rating']/span/text()"
     ).extract()
     headings = response.xpath(
         "//div[@class='cell seven-eighths  palm-one-whole']/h3/q/text()"
     ).extract()
     dates = response.xpath(
         "//div[@class='grid']/div[@class='cell one-eighth  palm-one-whole']/div[@class='quarter-margin-bottom  micro  color-gray  weight-normal  text-right  palm-text-left']/text()"
     ).extract()
     img_src = response.xpath(
         "//div[@class='thumbnail  no-hover  listing-thumbnail']/img/@src"
     ).extract()
     website_name = response.xpath(
         "//div[@class='site-logo-wrapper']/a/img[@class='site-logo']/@alt"
     ).extract()
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings[idx],
                                headings[idx], dates[idx], None,
                                category, servicename, review,
                                img_src, website_name)
         record.save()
Beispiel #8
0
    def crawl(self, response, category, servicename):
        """Store the reviews on a whoishostingthis.com page and paginate.

        Sample page: https://www.whoishostingthis.com/hosting-reviews/bluehost/

        Authors, ratings and review texts are extracted from ``response``;
        ratings, review texts and the pagination link each have a second
        selector that is tried when the first one matches nothing. The
        lists are combined by position into ``ServiceRecord`` objects (no
        heading or date) that are saved immediately. If a pagination link
        is found, a follow-up request with ``self.parsing`` as callback is
        yielded.
        """
        self.category = category
        self.servicename = servicename
        authors = response.xpath("//div[@class='author']/span[@class='name']/text()").extract()
        img_src = response.xpath("//div[@class='host-info wcc']/a[1]/img[@class=' logo']/@src").extract()
        website_name = response.xpath("//div[@class='mobile']/a[@class='home']/img[@class='logo']/@alt").extract()

        ratings1 = response.xpath("//div[@class='user-info pure-u-1']/img[@class='stars overall']/@alt").extract()
        if not ratings1:
            # Alternative markup for the rating stars.
            ratings1 = response.xpath("//div[@class='rating pure-u-1 pure-u-lg-1-3']/img[@class='stars overall']/@alt").extract()

        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath('//div[@class="comment pure-u-1 wcc"]')
        ]
        if not reviews:
            # Alternative markup for the review body.
            reviews = [
                node.xpath('string()').extract()
                for node in response.xpath('//div[@class="comment pure-u-1 pure-u-lg-2-3 wcc"]')
            ]

        # Dates are queried but not stored on the records below.
        dates = response.xpath("//div[@class='user-info pure-u-1']/time[@class='published']/text()").extract()

        for idx, review in enumerate(reviews):
            record = ServiceRecord(response.url, ratings1[idx], None, None,
                                   authors[idx], category, servicename,
                                   review, img_src, website_name)
            record.save()

        next_links = response.xpath("//div[@class ='see-more']/a/@ href").extract()
        if not next_links:
            next_links = response.xpath("//div[@class ='pure-u-1 pure-u-lg-1-4 next']/a/@ href").extract()
        if next_links is None:
            return
        next_page_url = "".join(next_links)
        if next_page_url.strip():
            yield response.follow(next_page_url, callback=self.parsing)
 def crawl(self, response, category, servicename):
     """Store the user reviews found on a hostingfacts.com review page.

     Sample page:
     https://hostingfacts.com/hosting-reviews/hostgator-wordpress-managed/

     Review texts, ratings, dates, headings and authors are extracted from
     ``response`` as parallel lists and combined by position into
     ``ServiceRecord`` objects that are saved immediately. The banner
     image list and navbar brand text list are attached to every record.
     """
     print("review from Hostingfacts.com")
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath('//div[@class="user-review-content"]')
     ]
     ratings = response.xpath(
         "//div[@class= 'user-review']/header/section/span[@class='user-review-rating']/span[@class='value']/text()"
     ).extract()
     dates = response.xpath(
         "//div[@class= 'user-review']/header/section/span[@class='user-review-meta']/text()"
     ).extract()
     headings = response.xpath(
         "//div[@class= 'user-review']/section/p[@class='user-review-title']/text()"
     ).extract()
     authors = response.xpath(
         "//div[@class='user-review']/header/section/p[@class='user-review-name']/a/span/text()"
     ).extract()
     img_src = response.xpath(
         "//div[@class='sidebar-padder']/aside/img[@class='img-responsive banner-image center-block']/@src"
     ).extract()
     website_name = response.xpath(
         "//div[@class='navbar-header']/a[@class='navbar-brand']/text()"
     ).extract()
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings[idx],
                                headings[idx], dates[idx],
                                authors[idx], category, servicename,
                                review, img_src, website_name)
         record.save()
    def crawl(self, response, category, servicename):
        """Store the reviews on a macupdate.com page and follow pagination.

        Review texts, ratings, dates, headings, authors and avatar URLs are
        extracted from ``response``; the lists are combined by position into
        ``ServiceRecord`` objects that are saved immediately. If a
        pagination link is present, a follow-up request with
        ``self.parsing`` as callback is yielded.

        NOTE(review): the rating handling below looks broken (see inline
        notes) -- confirm the intended behaviour before relying on it.
        """
        reviews = []
        self.category = category
        self.servicename = servicename
        print("review from macupdate.com")
        for node in response.xpath("//div/div[@class='yui3-u rcpb-content']/p[@class='rcpb-revcom-content']"):
            reviews.append(node.xpath('string()').extract())
        # NOTE(review): the trailing "/@text()" step is applied to an
        # attribute node; this presumably selects nothing and leaves
        # ``ratings`` empty -- verify whether "/@value" alone was meant.
        ratings = response.xpath("//div/div[@class='yui3-u rcpb-content']/div/input/@value/@text()").extract()
        dates = response.xpath("//div/div[@class='yui3-u rcpb-content']/span[@class='rcpb-postdate']/text()").extract()
        headings = response.xpath("//div[@class='box col-12 review-title']/h4/text()").extract()
        authors = response.xpath("//div[@class='box col-12 review-info']/strong/span/text()").extract()
        website_name = response.xpath("//div[@class='wpcr3_item_name']/a/text()").extract()
        img_src = response.xpath("//div[@class='avatar']/img/@src").extract()
        print(" raaaaaa")

        # NOTE(review): ``del [ratings]`` unbinds the local name ``ratings``
        # itself rather than removing a list element, so with two or more
        # ratings the later ``ratings[item]`` lookup fails. Possibly
        # ``del ratings[i]`` was intended -- TODO confirm.
        for i in range(len(ratings)):
            if i != 0:
                del [ratings]
        # Build and save one record per review; the field lists are indexed
        # in parallel, so a shorter list raises IndexError.
        for item in range(0, len(reviews)):
            servicename1 = ServiceRecord(response.url, ratings[item], headings[item], dates[item], authors[item],
                                         category, servicename, reviews[item], img_src, website_name)
            servicename1.save()

        # Pagination: the link is taken from the seventh anchor of the
        # navigator block.
        next_page = response.xpath("//div[@class ='navigator']/a[7]/@href").extract()
        if next_page is not None:
            next_page_url = "".join(next_page)
            if next_page_url and next_page_url.strip():
                print(type(next_page_url))
                print(next_page_url)
                # yield Request(url=next_page_url, callback=self.parse, dont_filter=True)
                yield response.follow(next_page_url, callback=self.parsing)
Beispiel #11
0
 def crawl(self, response, category, servicename):
     """Store the reviews found on a top11hosting.com review page.

     Sample page: https://top11hosting.com/hostgator-review/

     Review texts, rating style attributes, dates and authors are
     extracted from ``response``. The first two rating entries are
     discarded; each remaining one is passed through ``getStarts``,
     converted with ``int`` and divided by 20 before being stored as a
     string. The lists are then combined by position into
     ``ServiceRecord`` objects (no heading or image) that are saved
     immediately.
     """
     self.category = category
     self.servicename = servicename
     # TODO rating comes in as a percentage
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath(
             "//div[@class='wpcr3_item wpcr3_business']/div/blockquote[@class='wpcr3_content']"
         )
     ]
     rating_styles = response.xpath(
         "//div[@class='wpcr3_rating_style1_average']/@style").extract()
     # Drop the two leading entries; raises IndexError when fewer exist.
     rating_styles.pop(0)
     rating_styles.pop(0)
     dates = response.xpath(
         "//div[@class='wpcr3_review_datePublished']/text()").extract()
     authors = response.xpath(
         "//div[@class='wpcr3_review_author']/span[@class='wpcr3_caps']/text()"
     ).extract()
     website_name = response.xpath(
         "//div[@class='wpcr3_item_name']/a/text()").extract()
     ratings1 = [
         str(int(getStarts(style)) / 20) for style in rating_styles
     ]
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings1[idx], None,
                                dates[idx], authors[idx], category,
                                servicename, review, None,
                                website_name)
         record.save()
 def crawl(self, response, category, servicename):
     """Store the reviews on a sitejabber.com page and follow pagination.

     Sample page: https://www.sitejabber.com/reviews/zoosk.com

     Review texts, ratings, dates and headings are extracted from
     ``response``. Author names are recovered by parsing each
     ``author_name`` block with ``etree``: the block's own text is used
     when present, otherwise the text of each child element. The lists
     are combined by position into ``ServiceRecord`` objects (no image)
     that are saved immediately. If a "next" link is present, a follow-up
     request with ``self.parsing`` as callback is yielded.
     """
     self.category = category
     self.servicename = servicename
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath('//div[@class="review "]/p')
     ]
     ratings = response.xpath("//div[@class='star_rating']/@title").extract()
     dates = response.xpath("//div[@class='time tiny_text faded_text']/text()").extract()
     headings = response.xpath("//div[@class='review_title']/a/text()").extract()

     author_markup = response.xpath("//div[@class='author_name']").extract()
     authors = []
     for markup in author_markup:
         root = etree.fromstring(markup)
         if root.text is not None:
             authors.append(root.text)
         else:
             # No direct text: take the text of every child element.
             authors.extend(child.text for child in root)

     website_name = response.xpath("//div[@id='header_top']/a[@id='header_logo']/picture/img/@alt").extract()
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings[idx], headings[idx],
                                dates[idx], authors[idx], category,
                                servicename, review, None, website_name)
         record.save()

     next_links = response.xpath("// div[ @class ='paginator_next']/span/a[@class ='button outline']/@href").extract()
     if next_links is None:
         return
     next_page_url = "".join(next_links)
     if next_page_url.strip():
         yield response.follow(url=next_page_url, callback=self.parsing)
Beispiel #13
0
 def crawl(self, response, category, servicename):
     """Store the reviews found on a webhostinggeeks.com provider page.

     Sample page:
     https://webhostinggeeks.com/providers/hostgator?product=shared

     Review texts, dates, headings and authors are extracted from
     ``response`` as parallel lists and combined by position into
     ``ServiceRecord`` objects (no rating, empty image string) that are
     saved immediately.
     """
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath('//div[@class="text_description"]')
     ]
     dates = response.xpath(
         "//div[@class='top_line']/span/text()").extract()
     headings = response.xpath(
         "//div[@class='info_description']/p[@class='title_description ']/a/text()"
     ).extract()
     authors = response.xpath(
         "//div[@class='user-text']/p/text()").extract()
     website_name = response.xpath("/html/head/meta[9]/@content").extract()
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, None, headings[idx],
                                dates[idx], authors[idx], category,
                                servicename, review, "",
                                website_name)
         record.save()
Beispiel #14
0
 def crawl(self, response, category, servicename):
     """Store the user reviews found on a vpnmentor.com page.

     Review texts, ratings, dates, headings and authors are extracted from
     ``response`` as parallel lists, printed for diagnostics and combined
     by position into ``ServiceRecord`` objects (no image) that are saved
     immediately. ``category`` and ``servicename`` are remembered on the
     spider and passed through to every record.
     """
     self.category = category
     self.servicename = servicename
     print("review from vpnmentor.com")
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath(
             "//div[@class='review-item style_prevu_kit ']/div[@class='review-content']/p"
         )
     ]
     ratings = response.xpath("//div[@class='rate']/ul/li/text()").extract()
     dates = response.xpath(
         "//div[@class='row']/div[@class='col-md-4 col-xs-5']/div[@class='user']/div[@class='text-wrap']/h6/text()"
     ).extract()
     headings = response.xpath(
         "//div[@class='row']/div[@class='col-md-6 col-md-pull-2 col-xs-12']/div[@class='topic']/span/text()"
     ).extract()
     authors = response.xpath(
         "//div[@class='text-wrap']/h5/text()").extract()
     website_name = response.xpath(
         "//div[@class='wpcr3_item_name']/a/text()").extract()

     # Diagnostic dump of everything that was scraped.
     print(" Ratings ", len(ratings), ratings)
     print("dates ", len(dates), dates)
     print(" Reviews ", len(reviews), reviews)
     print(" headings ", len(headings), headings)
     print(" authors ", len(authors), authors)
     print(" website_name ", len(website_name), website_name)

     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings[idx],
                                headings[idx], dates[idx],
                                authors[idx], category, servicename,
                                review, None, website_name)
         record.save()
Beispiel #15
0
 def crawl(self, response, category, servicename):
     """Store the user reviews found on a seniordatingsites.com page.

     Review texts, star-image URLs and authors are extracted from
     ``response``. Each star-image URL is passed through ``getStarts``,
     converted to a string and stripped of "." characters to form the
     rating. The lists are printed for diagnostics and combined by
     position into ``ServiceRecord`` objects (no heading, date or image)
     that are saved immediately.

     The ratings are now built as a real list: the previous ``map(...)``
     result is a one-shot iterator on Python 3, which supports neither
     ``len()`` nor indexing and made this method fail.

     NOTE(review): a field list shorter than ``reviews`` still raises
     ``IndexError``.
     """
     self.category = category
     self.servicename = servicename
     print("review from seniordatingsites.com")
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-review']")
     ]
     star_images = response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-stars']/img/@src").extract()
     # A list (not a lazy map object) so it can be measured and indexed.
     ratings = [
         str(getStarts(src)).replace('.', '') for src in star_images
     ]
     authors = response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-name']/text()").extract()
     website_name = response.xpath("//div[@id='container']/div[@id='header']/div[@class='left eight columns']/div/a[@class='logo']/img/@title").extract()

     # Diagnostic dump of everything that was scraped.
     print(" Ratings ", len(ratings), ratings)
     print(" Reviews ", len(reviews), reviews)
     print(" authors ", len(authors), authors)
     print(" website_name ", len(website_name), website_name)

     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings[idx], None, None,
                                authors[idx], category, servicename,
                                review, None, website_name)
         record.save()
Beispiel #16
0
 def crawl(self, response, category, servicename):
     """Store the comment-style reviews found on a vpnranks.com page.

     Comment texts and dates are extracted from ``response``. Author
     names are recovered by parsing each ``comment-author vcard`` block
     with ``etree`` and collecting the text of its child elements; only
     every second collected entry (indices 0, 2, 4, ...) is kept. The
     lists are combined by position into ``ServiceRecord`` objects (no
     rating, heading or image) that are saved immediately.

     The every-second-entry selection is now a slice. The previous loop
     used ``range(len(authors) / 2 + 1)``, which is a float on Python 3
     and raised ``TypeError``; it also had two identical if/else branches.

     NOTE(review): a field list shorter than ``reviews`` still raises
     ``IndexError``.
     """
     self.category = category
     self.servicename = servicename
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath("//div[@class='comment-body']")
     ]
     dates = response.xpath(
         "//div[@class='comment-meta commentmetadata']/a/text()").extract()

     author_markup = response.xpath(
         "//div[@class='comment-author vcard']").extract()
     collected = []
     for markup in author_markup:
         collected.extend(child.text for child in etree.fromstring(markup))
     # Same result as deleting index i for i = 1 .. len // 2 in turn.
     authors = collected[::2]

     website_name = response.xpath(
         "//div[@class='wpcr3_item_name']/a/text()").extract()
     for idx, review in enumerate(reviews):
         record = ServiceRecord(response.url, None, None, dates[idx],
                                authors[idx], category, servicename,
                                review, None, website_name)
         record.save()
Beispiel #17
0
    def crawl(self, response, category, servicename):
        """Store the reviews found on a hostingcharges.in page.

        Review texts, per-criterion rating values, headings, dates, authors
        and logo image URLs are extracted from ``response``. Rating values
        are taken in consecutive groups of four; each group's integer sum
        is divided by 4.0 and stored as a string (a trailing group with
        fewer than four values is still divided by 4.0, as before). Review
        texts are stripped and entries whose first text is empty are
        dropped. The lists are printed for diagnostics and combined by
        position into ``ServiceRecord`` objects that are saved immediately.

        Fixes over the previous version:
        * the accumulator no longer shadows the builtin ``sum`` and is no
          longer read before assignment when the page has no ratings;
        * blank reviews are filtered in one pass, so consecutive blank
          entries are no longer skipped by a delete-and-advance loop.

        NOTE(review): a field list shorter than ``reviews`` still raises
        ``IndexError``.
        """
        self.category = category
        self.servicename = servicename
        print("review from hostingcharges.in")
        raw_reviews = [
            node.xpath('string()').extract()
            for node in response.xpath(
                "//div[@class='review-cntnr']/div[@class='review-sub-cntnr']/div[@class='review-one-all']/p"
            )
        ]
        ratings = response.xpath(
            "//div[@class='review-one-all']/div[@class='lftfeatures']/div/div/input/@value"
        ).extract()
        headings = response.xpath(
            "//div[@class='review-mid']/p/text()").extract()

        # One averaged rating per block of four raw values.
        ratings1 = [
            str(sum(int(score) for score in ratings[start:start + 4]) / 4.0)
            for start in range(0, len(ratings), 4)
        ]

        dates = response.xpath(
            "//div[@class='review-sub-cntnr']/div[@class='review-one-all']/div[@class='review-profile']/div[@class='review-mid']/p/text()"
        ).extract()
        img_src = response.xpath(
            "//div[@class='logo-profile']/img/@src").extract()
        authors = response.xpath(
            "//div[@class='review-mid']/h4/text()").extract()
        website_name = response.xpath(
            "//div[@class='wpcr3_item_name']/a/text()").extract()
        print(" Ratings ", len(ratings1), ratings1)

        # Strip whitespace, then drop entries that carry no text.
        stripped = [[text.strip() for text in texts] for texts in raw_reviews]
        reviews = [texts for texts in stripped if texts and texts[0] != '']

        print("dates ", len(dates), dates)
        print(" Reviews ", len(reviews), reviews)
        print(" headings ", len(headings), headings)
        print(" authors ", len(authors), authors)
        print(" website_name ", len(website_name), website_name)
        for idx, review in enumerate(reviews):
            record = ServiceRecord(response.url, ratings1[idx],
                                   headings[idx], dates[idx],
                                   authors[idx], category, servicename,
                                   review, img_src, website_name)
            record.save()
Beispiel #18
0
    def crawl(self, response, category, servicename):
        """Store the reviews on a HighYa page and follow pagination.

        Sample page: https://www.highya.com/coinbase-reviews

        Review texts, ratings, dates, headings and authors are extracted
        from ``response`` as parallel lists, printed for diagnostics and
        combined by position into ``ServiceRecord`` objects (no image) that
        are saved immediately. If a "next" pagination link is present, a
        follow-up request with ``self.parsing`` as callback is yielded.

        The author list used to stay empty because the extraction loop was
        leftover debug code that printed the first entry and broke out, so
        any page with reviews failed on ``authors[item]``. Authors are now
        taken as the stripped string value of the first ``li`` of each
        review's option list, which does not depend on whether the name is
        wrapped in an anchor.

        NOTE(review): the string value of that ``li`` may include text
        besides the name -- verify against the live page. A field list
        shorter than ``reviews`` still raises ``IndexError``.
        """
        self.category = category
        self.servicename = servicename
        print("review from HighYa.com")
        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath(
                "//div[@class='left-col col-lg-8 col-lg']/div[@id='reviews']/ul[@class='no-list list-review']/li/span/div[@class='description']"
            )
        ]
        ratings = response.xpath(
            "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/span[@class='review']/meta[@itemprop='ratingValue']/@content"
        ).extract()
        dates = response.xpath(
            "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/ul[@class='list-line options']/li[last()-1]/text()"
        ).extract()
        headings = response.xpath(
            "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/h3[@class='title']/text()"
        ).extract()
        # TODO the author markup sometimes differs (not always an anchor);
        # string() flattens whatever structure is there.
        authors = [
            "".join(node.xpath('string()').extract()).strip()
            for node in response.xpath(
                "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/ul[@class='list-line options']/li[1]"
            )
        ]
        website_name = response.xpath("//html/head/meta[7]/@content").extract()

        # Diagnostic dump of everything that was scraped.
        print(" Ratings ", len(ratings), ratings)
        print("dates ", len(dates), dates)
        print(" Reviews ", len(reviews), reviews)
        print(" headings ", len(headings), headings)
        print(" authors ", len(authors), authors)
        print(" website_name ", len(website_name), website_name)

        for idx, review in enumerate(reviews):
            record = ServiceRecord(response.url, ratings[idx],
                                   headings[idx], dates[idx],
                                   authors[idx], category, servicename,
                                   review, None, website_name)
            record.save()

        # extract() returns a (possibly empty) list, so joining is safe.
        next_page_url = "".join(response.xpath(
            "//div[@class='pagination']/a[@class='next']/@href").extract())
        if next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url, "    url")
            yield response.follow(url=next_page_url, callback=self.parsing)
Beispiel #19
0
    def crawl(self, response, category, servicename):
        """Scrape reviews, ratings and dates from a yscam.com comments page.

        One ``ServiceRecord`` is saved per review. Headings, authors and
        images are not collected here and are passed as ``None``.

        Args:
            response: Page response; must provide ``xpath()``, ``url`` and
                ``follow()``.
            category: Stored on ``self.category`` and on every record.
            servicename: Stored on ``self.servicename`` and on every record.

        Yields:
            A follow-up request (callback ``self.parsing``) when a next-page
            link is found.

        Raises:
            IndexError: If fewer ratings or dates than reviews were scraped.
        """
        self.category = category
        self.servicename = servicename
        print("review from yscam.com")
        reviews = [
            [text.strip() for text in node.xpath('string()').extract()]
            for node in response.xpath(
                "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock  ']/div[@class='comment  ']/div"
            )
        ]
        # The same <div> selector also matches the action links under each
        # comment. Drop every such entry in one pass; the previous
        # delete-while-iterating loop compared against two different
        # spellings and could leave consecutive "Mark as Useful" entries in.
        ui_labels = ('Mark as Useful', 'Mark as useful', 'Post Reply')
        reviews = [texts for texts in reviews if texts[0] not in ui_labels]
        ratings = response.xpath(
            "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock  ']/div[@class='comment  ']/ul[@class='postby']/li[2]/span[@class='smallStars']/@data-score"
        ).extract()
        dates = response.xpath(
            "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock  ']/div[@class='comment  ']/ul[@class='postby']/li[1]/text()"
        ).extract()
        website_name = response.xpath(
            "//div[@class='wpcr3_item_name']/a/text()").extract()
        print("Reviews ", len(reviews), reviews)
        print("Rating ", len(ratings), ratings)
        print("Dates ", len(dates), dates)
        for item, review in enumerate(reviews):
            record = ServiceRecord(response.url, ratings[item], None,
                                   dates[item], None, category, servicename,
                                   review, None, website_name)
            record.save()

        # extract() always returns a list, so only the joined URL needs
        # checking for emptiness.
        next_page_url = "".join(
            response.xpath("//div[@class ='navigator']/a[7]/@href").extract())
        if next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url)
            yield response.follow(next_page_url, callback=self.parsing)
Beispiel #20
0
    def crawl(self, response, category, servicename):
        """Scrape reviews and per-review star ratings and save them.

        Star values are collected from the comment star images (falling back
        to the overall-stars ``alt`` text when none are found), converted via
        ``getStarts`` and averaged in consecutive groups of five. One
        ``ServiceRecord`` is saved per review.

        Args:
            response: Page response; must provide ``xpath()`` and ``url``.
            category: Stored on ``self.category`` and on every record.
            servicename: Stored on ``self.servicename`` and on every record.

        Raises:
            IndexError: If fewer ratings, headings, dates or authors than
                reviews were scraped.
            ValueError: If a cleaned star value is not an integer string.
        """
        self.category = category
        self.servicename = servicename
        authors = response.xpath(
            "//div[@class='comment-user-left name']/text()").extract()
        dates = response.xpath(
            "//div[@class='comment-user-left date']/text()").extract()
        website_name = response.xpath(
            "//div[@id='line']/a[1]/img/@alt").extract()
        headings = response.xpath(
            "//div[@class='comments_user_comment']/a/text()").extract()
        star_sources = response.xpath(
            "//div[@class='comment_user_star_rate']/div[@class='comment_user_stars']/img/@src"
        ).extract()
        if not star_sources:
            star_sources = response.xpath(
                "//div[@class='rating pure-u-1 pure-u-lg-1-3']/img[@class='stars overall']/@alt"
            ).extract()
        # getStarts is defined elsewhere in the project; its string result is
        # stripped of '-' and '.' before being parsed as an integer.
        # A list (not a lazy map object) is required because it is sliced.
        star_values = [
            getStarts(source).replace('-', '').replace('.', '')
            for source in star_sources
        ]
        # Average every run of five star values; a trailing partial run is
        # still divided by five, as in the original accumulation loop.
        ratings = []
        for start in range(0, len(star_values), 5):
            total = 0
            for value in star_values[start:start + 5]:
                total += int(value)
            ratings.append(str(total / 5.0))
        if not ratings:
            # The original loop always emitted one trailing average, which is
            # "0.0" when no star values were scraped at all.
            ratings.append(str(0 / 5.0))

        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath('//div[@class="comment-body"]')
        ]
        if not reviews:
            reviews = [
                node.xpath('string()').extract() for node in response.xpath(
                    '//div[@class="comment pure-u-1 pure-u-lg-2-3 wcc"]')
            ]

        for item, review in enumerate(reviews):
            record = ServiceRecord(response.url, ratings[item],
                                   headings[item], dates[item],
                                   authors[item], category, servicename,
                                   review, None, website_name)
            record.save()
Beispiel #21
0
 # Unfinished scraper template: the XPath for the review nodes is an empty
 # string and the ratings/dates/headings/authors/website_name assignments
 # have no right-hand side, so this method does not parse as Python until
 # the selectors are filled in.
 def crawl(self, response,category,servicename):
     reviews = []
     # https://tbwhs.com/fatcow-web-hosting-reviews/
     # Collect the string value of every matched review node.
     for node in response.xpath(''):
         reviews.append(node.xpath('string()').extract());
     # TODO: supply the XPath extraction for each of the fields below.
     ratings =
     dates =
     headings =
     authors =
     website_name =
     # NOTE(review): the range starts at 1, so reviews[0] is never saved --
     # confirm whether skipping the first entry is intentional.
     for item in range(1, len(reviews)):
         servicename1 = ServiceRecord(response.url,ratings[item],headings[item],dates[item],authors[item],category,servicename,reviews[item],"",website_name);
         servicename1.save()
Beispiel #22
0
 def crawl(self, response, category, servicename):
     """Scrape reviews from a page using ``review_top`` / ``review_details`` markup.

     One ``ServiceRecord`` is saved per review, pairing each review with the
     heading, date, rating and author found at the same index.

     Args:
         response: Page response; must provide ``xpath()`` and ``url``.
         category: Category value stored on every record.
         servicename: Service name stored on every record.

     Raises:
         IndexError: If fewer ratings, headings, dates or authors than
             reviews were scraped.
     """
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath('//div[@class="review_top"]/p')
     ]
     headings = response.xpath("//div[@class='review']/div[@class='review_top']/span/h3/a/text()").extract()
     dates = response.xpath("//div[@class='review_details']/span/text()").extract()
     ratings = response.xpath("//div[@class='review_details']/div/div/a/text()").extract()
     authors = response.xpath("//div[@class='review_details']/span/strong/text()").extract()
     img_src = response.xpath("//div[@class='broker_img_container']/img/@src").extract()
     website_name = response.xpath("//div[@class='content'][1]/div[@class='top']/a[@class='logo']/@title").extract()
     for item, review in enumerate(reviews):
         # Pass this review's own heading; the whole headings list used to be
         # attached to every record.
         record = ServiceRecord(response.url, ratings[item], headings[item],
                                dates[item], authors[item], category,
                                servicename, review, img_src, website_name)
         record.save()
 def crawl(self, response, category, servicename):
     """Scrape reviews from a resellerratings.com store page.

     One ``ServiceRecord`` is saved per review, pairing each review with the
     rating, heading, date and author found at the same index.

     Args:
         response: Page response; must provide ``xpath()`` and ``url``.
         category: Category value stored on every record.
         servicename: Service name stored on every record.

     Raises:
         IndexError: If fewer ratings, headings, dates or authors than
             reviews were scraped.
     """
     print("Reviews from Resellerrating.com")
     # https://www.resellerratings.com/store/Nordvpn_com
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath("//div[@class='comment']/p[@class='review-body']/span")
     ]
     headings = response.xpath("//div[@class='comment']/p[@class='review-title']/span/text()").extract()
     dates = response.xpath("//div[@class='comment']/div[@class='date fr']/span/text()").extract()
     ratings = response.xpath("//div[@class='rating siteStars fl']/span[@class='ratingLabel']/span[@class='bold']/text()").extract()
     authors = response.xpath("//div[@class='user-column']/a[@class='rr-purple show-for-large']/text()").extract()
     website_name = response.xpath("//html/head/meta[15]/@content").extract()
     for item, review in enumerate(reviews):
         # Every other ServiceRecord call in this file passes ten positional
         # arguments, with the image source ninth and the website name tenth.
         # No image is scraped here, so pass None to keep website_name in
         # its own slot.
         record = ServiceRecord(response.url, ratings[item], headings[item],
                                dates[item], authors[item], category,
                                servicename, review, None, website_name)
         record.save()
 # Unfinished scraper template: the review-node iterable and the
 # dates/headings/authors/img_src/website_name assignments have no
 # expression, so this method does not parse as Python until the selectors
 # are filled in.
 def crawl(self, response, category, servicename):
     # Remember the arguments on the instance.
     self.category = category
     self.servicename = servicename
     reviews = []
     # https://reviews.financesonline.com/p/vyprvpn/
     # TODO: supply the selector for the review nodes.
     for node in :
         reviews.append(node.xpath('string()').extract());
     # TODO: supply the XPath extraction for each of the fields below.
     dates =
     headings =
     authors =
     img_src =
     website_name =  
     # Ratings and headings are passed as None for every saved record.
     for item in range(0, len(reviews)):
         servicename1 = ServiceRecord(response.url, None, None, dates[item], authors[item], category,
                       servicename, reviews[item],img_src,website_name)
         servicename1.save()
Beispiel #25
0
    def crawl(self, response, category, servicename):
        """Save one ``ServiceRecord`` per review block and follow pagination.

        The scraped ``ratingValue`` numbers are halved before being stored.

        Args:
            response: Page response; must provide ``xpath()``, ``url`` and
                ``follow()``.
            category: Stored on ``self.category`` and on every record.
            servicename: Stored on ``self.servicename`` and on every record.

        Yields:
            A follow-up request (callback ``self.parsing``) when a next-page
            link is found.
        """
        self.category = category
        self.servicename = servicename
        review_nodes = response.xpath(
            "//div[@class='box col-12 review-detail']")
        reviews = [block.xpath('string()').extract() for block in review_nodes]
        raw_scores = response.xpath(
            "//div[@class='box col-12 review-title']/meta[@itemprop='ratingValue']/@content"
        ).extract()
        dates = response.xpath(
            "//div[@class='box col-12 review-info']/span[@class='review-date']/text()"
        ).extract()
        headings = response.xpath(
            "//div[@class='box col-12 review-title']/h4/text()").extract()
        authors = response.xpath(
            "//div[@class='box col-12 review-info']/strong/span/text()"
        ).extract()
        website_name = response.xpath(
            "//div[@class='wpcr3_item_name']/a/text()").extract()
        img_src = response.xpath("//div[@class='avatar']/img/@src").extract()
        # Halve each scraped score and keep it as text.
        ratings = [str(int(score) / 2.0) for score in raw_scores]

        for idx, review_text in enumerate(reviews):
            record = ServiceRecord(
                response.url,
                ratings[idx],
                headings[idx],
                dates[idx],
                authors[idx],
                category,
                servicename,
                review_text,
                img_src,
                website_name,
            )
            record.save()

        pager_links = response.xpath(
            "//div[@class ='navigator']/a[7]/@href").extract()
        if pager_links is None:
            return
        next_page_url = "".join(pager_links)
        if not (next_page_url and next_page_url.strip()):
            return
        print(type(next_page_url))
        print(next_page_url)
        yield response.follow(next_page_url, callback=self.parsing)
Beispiel #26
0
 # Unfinished scraper template: the review-node iterable and the
 # dates/authors/img_src/website_name assignments have no expression, so
 # this method does not parse as Python until the selectors are filled in.
 def crawl(self, response, category, servicename):
     # Remember the arguments on the instance.
     self.category = category
     self.servicename = servicename
     reviews = []
     # https://www.datingsitesreviews.com/staticpages/index.php?page=BlackPeopleMeet-Reviews&query=blackpeoplemeet
     # TODO: supply the selector for the review nodes.
     for node in :
         reviews.append(node.xpath('string()').extract());
     # Text of each top-level comment's "comment_author" item; not used
     # further in the visible code.
     temp_data = response.xpath("//div[@id='comments']/div[@class='block-comment-content level-0']/ul[@class='comment_status']/li[@class='comment_author']/text()").extract()
     dates =
     # NOTE(review): headings are extracted but None is passed as the
     # heading of every record below -- confirm which is intended.
     headings =  response.xpath("//div[@id='comments']/div[@class='block-comment-content level-0']/ul[@class='comment_status']/li[@class='comment_title']/text()").extract()
     authors =
     img_src =
     website_name =
     for item in range(0, len(reviews)):
         servicename1 = ServiceRecord(response.url, None, None, dates[item], authors[item], category,
                       servicename, reviews[item],img_src,website_name)
         servicename1.save()
Beispiel #27
0
 def crawl(self, response, category, servicename):
     """Save one ``ServiceRecord`` per consumeraffairs.com review paragraph.

     Ratings are scraped (and the first five dropped) but records are saved
     with ``None`` for both rating and heading.

     Args:
         response: Page response; must provide ``xpath()``, ``url`` and
             ``follow()``.
         category: Stored on ``self.category`` and on every record.
         servicename: Stored on ``self.servicename`` and on every record.

     Yields:
         A follow-up request (callback ``self.parsing``) when a next-page
         link is found.
     """
     self.category = category
     self.servicename = servicename
     print("review from Consumeraffairs.com")
     # https://www.consumeraffairs.com/internet/godaddy.html
     paragraph_nodes = response.xpath(
         "//div[@class='campaign-reviews__regular-container js-campaign-reviews__regular-container']/div/div[@class='rvw-bd ca-txt-bd-2']/p"
     )
     reviews = [para.xpath('string()').extract() for para in paragraph_nodes]
     ratings = response.xpath(
         "//div[@class='stars-rtg stars-rtg--sm']/@data-rating").extract()
     # Discard the five leading rating entries, one at a time.
     for _ in range(5):
         ratings.pop(0)
     temp_dates = response.xpath(
         "//div[@class='rvw-bd ca-txt-bd-2']/span[@class='ca-txt-cpt ca-txt--clr-gray']/text()"
     ).extract()
     # Keep the part between the first and second ':' of each date string.
     dates = [stamp.split(":")[1] for stamp in temp_dates]
     authors = response.xpath(
         "//div[@class='rvw-aut']/div[@class='rvw-aut__inf']/strong[@class='rvw-aut__inf-nm']/text()"
     ).extract()
     website_name = response.xpath("//html/head/meta[3]/@content").extract()
     for idx, review_text in enumerate(reviews):
         record = ServiceRecord(
             response.url,
             None,
             None,
             dates[idx],
             authors[idx],
             category,
             servicename,
             review_text,
             None,
             website_name,
         )
         record.save()
     pager_links = response.xpath(
         "//div[@class='prf-lst']/nav[@class='prf-pgr js-profile-pager']/a[@class='ca-a-md "
         "ca-a-uprcs ca-a-blk prf-pgr__nxt js-profile-pager__next']/@href"
     ).extract()
     if pager_links is None:
         return
     next_page_url = "".join(pager_links)
     if not (next_page_url and next_page_url.strip()):
         return
     print(type(next_page_url))
     print(next_page_url, "    url")
     yield response.follow(url=next_page_url, callback=self.parsing)
Beispiel #28
0
 def crawl(self, response, category, servicename):
     """Scrape user reviews from a thevpnlab.com review page.

     Each ``user-name`` text is split into an author and a date, and one
     ``ServiceRecord`` is saved per review (heading is ``None``).

     Args:
         response: Page response; must provide ``xpath()`` and ``url``.
         category: Category value stored on every record.
         servicename: Service name stored on every record.

     Raises:
         IndexError: If fewer ratings, dates or authors than reviews were
             scraped, or if the head ``<script>`` used for the website name
             is missing or has an unexpected layout.
     """
     import re  # local import: only this method needs it

     # https://www.thevpnlab.com/reviews/nordvpn-review
     reviews = [
         node.xpath('string()').extract()
         for node in response.xpath("//div[@class='ur-inner']/div[@class='user-review']")
     ]
     date_authors = response.xpath("//div[@class='ur-inner']/div[@class='user-name']/text()").extract()
     authors = []
     dates = []
     for element in date_authors:
         # The parsing implies text shaped like "By <author> on <date>".
         # Split on the last standalone word "on" so that letters inside a
         # name or date ("Jason", "Monday") no longer cut the text short.
         separators = list(re.finditer(r"\bon\b", element))
         if separators:
             last = separators[-1]
             byline = element[:last.start()]
             date_text = element[last.end():]
         else:
             # No separator: keep the whole text for both parts, as before.
             byline = date_text = element
         # Split only on the first "By" so names containing "By" stay intact.
         authors.append(byline.split("By", 1)[-1])
         dates.append(date_text)
     ratings = response.xpath("//div[@class='user-stars']/div/@id").extract()
     img_src = response.xpath("//div[@class='introvoerview']/div[@id='introimg']/img/@src").extract()
     temp_data = response.xpath("//html/head/script[4]/text()").extract()
     # Take the value after ':' in the fourth comma-separated chunk of the
     # script text.
     website_name = temp_data[0].split(",")[3].split(":")[1]
     for item, review in enumerate(reviews):
         record = ServiceRecord(response.url, ratings[item], None,
                                dates[item], authors[item], category,
                                servicename, review, img_src, website_name)
         record.save()
Beispiel #29
0
    def crawl(self, response, category, servicename):
        """Save one ``ServiceRecord`` per 10bestonline.com customer review.

        Ratings, headings and images are not collected and are passed as
        ``None``.

        Args:
            response: Page response; must provide ``xpath()``, ``url`` and
                ``follow()``.
            category: Stored on ``self.category`` and on every record.
            servicename: Stored on ``self.servicename`` and on every record.

        Yields:
            A follow-up request (callback ``self.parsing``) when a next-page
            link is found.
        """
        self.category = category
        self.servicename = servicename
        print("review from 10bestonline.com")
        comment_cells = response.xpath(
            "//div[@class='cust_review']/table/tbody/tr[5]/td[@class='comment']"
        )
        reviews = [cell.xpath('string()').extract() for cell in comment_cells]
        dates = response.xpath(
            "//div[@class='customer_reviews']/div/div[@class='cust_review']/table/tbody/tr[2]/td[@class='customer']/text()"
        ).extract()
        # Authors are kept as the extracted <td> markup, not as text().
        authors = response.xpath(
            "//div[@class='cust_review']/table/tbody/tr[3]/td[@class='customer']"
        ).extract()
        website_name = response.xpath(
            "//div[@class='wpcr3_item_name']/a/text()").extract()

        for idx, review_text in enumerate(reviews):
            record = ServiceRecord(
                response.url,
                None,
                None,
                dates[idx],
                authors[idx],
                category,
                servicename,
                review_text,
                None,
                website_name,
            )
            record.save()

        pager_links = response.xpath(
            "//div[@class ='navigator']/a[7]/@href").extract()
        if pager_links is None:
            return
        next_page_url = "".join(pager_links)
        if not (next_page_url and next_page_url.strip()):
            return
        print(type(next_page_url))
        print(next_page_url)
        yield response.follow(next_page_url, callback=self.parsing)
 def crawl(self, response, category, servicename):
     """Save one ``ServiceRecord`` per hostadvice.com review summary.

     Author names are read from the re-parsed ``review-author`` markup;
     dates are not collected and are passed as ``None``.

     Args:
         response: Page response; must provide ``xpath()`` and ``url``.
         category: Stored on ``self.category`` and on every record.
         servicename: Stored on ``self.servicename`` and on every record.

     Yields:
         A ``Request`` for the next page (callback ``self.parsing``) when a
         next-page link is found.
     """
     self.category = category
     self.servicename = servicename
     # https://hostadvice.com/hosting-company/godaddy-reviews/
     summary_nodes = response.xpath('//div[@class="review-summary"]')
     reviews = [summary.xpath('string()').extract() for summary in summary_nodes]
     ratings = response.xpath(
         "//div[@class='review-rating clearfix']/span[@class='review-score']/text()"
     ).extract()
     headings = response.xpath(
         "//div[@class='review-content']/h3[@class='review_header']/text()"
     ).extract()
     author_fragments = response.xpath("//div[@class='review-author']").extract()
     authors = []
     for fragment in author_fragments:
         # Re-parse the author <div> and record one name per direct child:
         # a <strong> child gives its own text, any other child gives the
         # text of the first a/strong element found by the XPath query.
         for child in etree.fromstring(fragment):
             if child.tag != 'strong':
                 authors.append(child.xpath("//a/strong")[0].text)
             else:
                 authors.append(child.text)
     img_src = response.xpath(
         "//div[@class='col-md-offset-1 col-md-5 col-xs-6']/img[ @class='attachment-post-thumbnail size-post-thumbnail wp-post-image']/@src"
     ).extract()
     website_name = response.xpath(
         "//div[@class='location_info']/span[2]/span[1]/a[@class='home']/span/text()"
     ).extract()
     for idx, review_text in enumerate(reviews):
         record = ServiceRecord(
             response.url,
             ratings[idx],
             headings[idx],
             None,
             authors[idx],
             category,
             servicename,
             review_text,
             img_src,
             website_name,
         )
         record.save()
     pager_links = response.xpath(
         "//div[@class='col-md-offset-2 col-md-4']/a[ @class ='orange_button']/@href"
     ).extract()
     if pager_links is not None:
         next_page_url = " ".join(pager_links)
         if next_page_url and next_page_url.strip():
             yield Request(url=next_page_url, callback=self.parsing)