def crawl(self, response, category, servicename):
    """Scrape customer reviews from a pickuphost.com review page.

    Saves one ServiceRecord per review.  Iteration starts at index 1,
    presumably skipping a header/summary entry -- TODO confirm.
    """
    # http://pickuphost.com/review/bluehost/#customer_review_shap
    reviews = [node.xpath('string()').extract()
               for node in response.xpath(
                   "//div[@class='one_rew']/div[@class='rewiwer_post']/span")]
    ratings = response.xpath(
        "//div[@class='col-md-12 avg_ratting_bg text-center']/div[@class='avg_ratting text-center']/text()"
    ).extract()
    headings = response.xpath(
        "//div[@id='rew_replace_div']/div[@class='one_rew']/h4/b/text()"
    ).extract()
    dates = response.xpath(
        "//div[@id='rew_replace_div']/div[@class='one_rew']/span[@class='rewiwer_data']/span[2]/text()"
    ).extract()
    authors = response.xpath(
        "//div[@id='rew_replace_div']/div[@class='one_rew']/span[@class='rewiwer_data']/span[1]/text()"
    ).extract()
    # Derive the site name from the first navbar link's href.
    name = response.xpath("//div[@class='navbar-header']/a/@href").extract()
    website_name = name[0].split(".")[0].split("/")[-1]
    for idx in range(1, len(reviews)):
        record = ServiceRecord(response.url, ratings[idx], headings[idx],
                               dates[idx], authors[idx], category,
                               servicename, reviews[idx], "", website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from blackpeoplemeet.pissedconsumer.com.

    Saves one ServiceRecord per review; headings and authors are not
    available on this site, so None is stored in those slots.
    """
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from blackpeoplemeet.pissedconsumer.com")
    for node in response.xpath(
            "//div[@class='middleware-review-container'][1]/div/div[@class='f-component-info']/div[@class='f-component-text']/div[@class='overflow-text']"
    ):
        reviews.append(node.xpath('string()').extract())
    ratings = response.xpath(
        "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock ']/div[@class='comment ']/ul[@class='postby']/li[2]/span[@class='smallStars']/@data-score"
    ).extract()
    # BUG FIX: the original expression started with "div[...]" (a relative
    # path), which never matches from the response root -- prefixed with
    # "//" so the date nodes are actually found.
    dates = response.xpath(
        "//div[@class='middleware-review-container']/div/div[@class='f-component-info']/div[@class='f-component-info-header']/time[@class='post-time secondary-info']/text()"
    ).extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item], None,
                                     dates[item], None, category,
                                     servicename, reviews[item], None,
                                     website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape yelp.com business reviews and follow the pagination link."""
    # https://www.yelp.com/biz/fatcow-burlington
    self.category = category
    self.servicename = servicename
    reviews = [node.xpath('string()').extract()
               for node in response.xpath('//div[@class="review-content"]')]
    ratings = response.xpath("//div[@class='biz-rating biz-rating-large clearfix']/div/div/@title").extract()
    dates = response.xpath("//div[@class='biz-rating biz-rating-large clearfix']/span[@class='rating-qualifier']/text()").extract()
    authors = response.xpath("//div[@class='media-story']/ul[@class='user-passport-info']/li[@class='user-name']/a[@id='dropdown_user-name']/text()").extract()
    website_name = response.xpath("//html/head/meta[6]/@content").extract()
    print(" Ratings ", len(ratings), ratings)
    print("dates ", len(dates), dates)
    print(" Reviews ", len(reviews), reviews)
    print(" authors ", len(authors), authors)
    print(" website_name ", len(website_name), website_name)
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, ratings[idx], None, dates[idx],
                               authors[idx], category, servicename, review,
                               "", website_name)
        record.save()
    next_page = response.xpath("//div[@class='arrange_unit']/a[@class='u-decoration-none next pagination-links_anchor']/@href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url, " url")
            yield response.follow(url=next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape blog-comment reviews and follow the previous-page link."""
    reviews = []
    self.category = category
    self.servicename = servicename
    authors = response.xpath(
        "//div[@class='comment-meta commentmetadata']/cite[@class='fn']/text()"
    ).extract()
    dates = response.xpath(
        "//div[@class='comment-time']/a/time/text()").extract()
    for node in response.xpath("//div[@class='comment-text']/span"):
        reviews.append(node.xpath('string()').extract())
    print("reviews ", reviews)
    img_src = response.xpath(
        "//div[@class='vcard-wrap']/img[@class='avatar avatar-100 wp-user-avatar wp-user-avatar-100 photo avatar-default']/@src"
    ).extract()
    # BUG FIX: the original used "///html/head/title/text()"; a triple
    # slash is not valid XPath and raises an evaluation error.
    website_name = response.xpath("//html/head/title/text()").extract()
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, None, None, dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
    next_page = response.xpath(
        "//div[@class='nav-previous']/a/@href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url)
            yield response.follow(url=next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape reader comments from restoreprivacy.com."""
    self.category = category
    self.servicename = servicename
    print("review from restoreprivacy.com")
    # https://www.highya.com/coinbase-reviews
    reviews = [node.xpath('string()').extract()
               for node in response.xpath("//div[@class='comment-text-inner']")]
    dates = response.xpath(
        "//div[@class='comment-author vcard']/span[@class='ago']/text()"
    ).extract()
    authors = response.xpath(
        "//div[@class='comment-author vcard']/span[@class='fn']/span/text()"
    ).extract()
    # Extracted but not stored on the record (original behavior preserved).
    img_src = response.xpath(
        "//div[@class='comment-author vcard']/img[@class='avatar avatar-50 photo']/@src"
    ).extract()
    website_name = response.xpath(
        "//div[@class='title-area']/p[@class='site-title']/a/text()"
    ).extract()
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, None, None, dates[idx],
                               authors[idx], category, servicename, review,
                               None, website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from productreview.com.au and follow pagination."""
    # https://www.productreview.com.au/p/smart-fares.html
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from productreview.com")
    for node in response.xpath("//div[@class='review-overall']"):
        reviews.append(node.xpath('string()').extract())
    ratings = response.xpath("//div[@class='rating-md']/p/span/span[@itemprop='ratingValue']/@content").extract()
    headings = response.xpath("//div[@class='review-content']/h3/text()").extract()
    # BUG FIX: this exact extraction was performed twice in the original;
    # the redundant duplicate has been removed.
    dates = response.xpath("//div[@class='review-content']/div[@class='rating-md']/p/meta/@content").extract()
    authors = response.xpath("//div[@class='review-author']/h6/a/text()").extract()
    img_src = response.xpath("//div[@class='item-header-img']/span[@class='item-header-img-container']/img/@src").extract()
    website_name = response.xpath("/html/head/meta[7]/@content").extract()
    print("dates ", len(dates), dates)
    print(" Reviews ", len(reviews), reviews)
    print(" headings ", len(headings), headings)
    print(" authors ", len(authors), authors)
    print(" website_name ", len(website_name), website_name)
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
    next_page = response.xpath("//div[@class='pagination-container']/ul[@class='pagination']/li[7]/a/@href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url)
            yield response.follow(next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape reviews from Capterra product pages."""
    # https://www.capterra.com/p/170765/ExpressVPN/
    print("Reviews from Capterra.con")
    reviews = [node.xpath('string()').extract()
               for node in response.xpath('//div[@class="review-comments color-text"]')]
    ratings = response.xpath(
        "//div[@class='overall-rating-container']/span[@class='overall-rating']/span/text()"
    ).extract()
    headings = response.xpath(
        "//div[@class='cell seven-eighths palm-one-whole']/h3/q/text()"
    ).extract()
    dates = response.xpath(
        "//div[@class='grid']/div[@class='cell one-eighth palm-one-whole']/div[@class='quarter-margin-bottom micro color-gray weight-normal text-right palm-text-left']/text()"
    ).extract()
    img_src = response.xpath(
        "//div[@class='thumbnail no-hover listing-thumbnail']/img/@src"
    ).extract()
    website_name = response.xpath(
        "//div[@class='site-logo-wrapper']/a/img[@class='site-logo']/@alt"
    ).extract()
    # No author information on this layout -- None is stored.
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, ratings[idx], headings[idx],
                               dates[idx], None, category, servicename,
                               review, img_src, website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from whoishostingthis.com and follow pagination.

    The site has two alternate layouts, so ratings, review bodies, and the
    next-page link each fall back to a second selector when the first
    matches nothing.
    """
    # https://www.whoishostingthis.com/hosting-reviews/bluehost/
    self.category = category
    self.servicename = servicename
    reviews = []
    authors = response.xpath("//div[@class='author']/span[@class='name']/text()").extract()
    img_src = response.xpath("//div[@class='host-info wcc']/a[1]/img[@class=' logo']/@src").extract()
    website_name = response.xpath("//div[@class='mobile']/a[@class='home']/img[@class='logo']/@alt").extract()
    ratings1 = response.xpath("//div[@class='user-info pure-u-1']/img[@class='stars overall']/@alt").extract()
    if not ratings1:
        ratings1 = response.xpath("//div[@class='rating pure-u-1 pure-u-lg-1-3']/img[@class='stars overall']/@alt").extract()
    for node in response.xpath('//div[@class="comment pure-u-1 wcc"]'):
        reviews.append(node.xpath('string()').extract())
    if not reviews:
        for node in response.xpath('//div[@class="comment pure-u-1 pure-u-lg-2-3 wcc"]'):
            reviews.append(node.xpath('string()').extract())
    # Extracted but not stored on the record (original behavior preserved).
    dates = response.xpath("//div[@class='user-info pure-u-1']/time[@class='published']/text()").extract()
    for idx in range(len(reviews)):
        record = ServiceRecord(response.url, ratings1[idx], None, None,
                               authors[idx], category, servicename,
                               reviews[idx], img_src, website_name)
        record.save()
    next_page = response.xpath("//div[@class ='see-more']/a/@ href").extract()
    if not next_page:
        next_page = response.xpath("//div[@class ='pure-u-1 pure-u-lg-1-4 next']/a/@ href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            yield response.follow(next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape user reviews from hostingfacts.com."""
    # https://hostingfacts.com/hosting-reviews/hostgator-wordpress-managed/
    print("review from Hostingfacts.com")
    reviews = [node.xpath('string()').extract()
               for node in response.xpath('//div[@class="user-review-content"]')]
    ratings = response.xpath(
        "//div[@class= 'user-review']/header/section/span[@class='user-review-rating']/span[@class='value']/text()"
    ).extract()
    dates = response.xpath(
        "//div[@class= 'user-review']/header/section/span[@class='user-review-meta']/text()"
    ).extract()
    headings = response.xpath(
        "//div[@class= 'user-review']/section/p[@class='user-review-title']/text()"
    ).extract()
    authors = response.xpath(
        "//div[@class='user-review']/header/section/p[@class='user-review-name']/a/span/text()"
    ).extract()
    img_src = response.xpath(
        "//div[@class='sidebar-padder']/aside/img[@class='img-responsive banner-image center-block']/@src"
    ).extract()
    website_name = response.xpath(
        "//div[@class='navbar-header']/a[@class='navbar-brand']/text()"
    ).extract()
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, ratings[idx], headings[idx],
                               dates[idx], authors[idx], category,
                               servicename, review, img_src, website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from macupdate.com and follow pagination."""
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from macupdate.com")
    for node in response.xpath("//div/div[@class='yui3-u rcpb-content']/p[@class='rcpb-revcom-content']"):
        reviews.append(node.xpath('string()').extract())
    # BUG FIX: the original selector ended "/@value/@text()", which is not
    # valid XPath (an attribute node has no attributes) and raised at
    # evaluation time; the rating is the <input>'s value attribute itself.
    ratings = response.xpath("//div/div[@class='yui3-u rcpb-content']/div/input/@value").extract()
    dates = response.xpath("//div/div[@class='yui3-u rcpb-content']/span[@class='rcpb-postdate']/text()").extract()
    headings = response.xpath("//div[@class='box col-12 review-title']/h4/text()").extract()
    authors = response.xpath("//div[@class='box col-12 review-info']/strong/span/text()").extract()
    website_name = response.xpath("//div[@class='wpcr3_item_name']/a/text()").extract()
    img_src = response.xpath("//div[@class='avatar']/img/@src").extract()
    print(" raaaaaa")
    # NOTE(review): the original ran `for i in range(len(ratings)): if i != 0:
    # del [ratings]`, which deletes a throwaway list literal -- a no-op.
    # Removed as dead code; if rating deduplication was intended it still
    # needs to be written.  TODO confirm.
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
    next_page = response.xpath("//div[@class ='navigator']/a[7]/@href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url)
            yield response.follow(next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape reviews from top11hosting.com (WP Customer Reviews markup)."""
    # TODO: rating arrives as a CSS width percentage
    # https://top11hosting.com/hostgator-review/
    self.category = category
    self.servicename = servicename
    reviews = [node.xpath('string()').extract()
               for node in response.xpath(
                   "//div[@class='wpcr3_item wpcr3_business']/div/blockquote[@class='wpcr3_content']"
               )]
    ratings = response.xpath(
        "//div[@class='wpcr3_rating_style1_average']/@style").extract()
    # Drop the first two style attributes -- presumably page-level average
    # widgets, not individual reviews.  TODO confirm.
    ratings.pop(0)
    ratings.pop(0)
    dates = response.xpath(
        "//div[@class='wpcr3_review_datePublished']/text()").extract()
    authors = response.xpath(
        "//div[@class='wpcr3_review_author']/span[@class='wpcr3_caps']/text()"
    ).extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    # Convert the percentage width into a star figure (percent / 20).
    ratings1 = [str(int(getStarts(raw)) / 20) for raw in ratings]
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, ratings1[idx], None, dates[idx],
                               authors[idx], category, servicename, review,
                               None, website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from sitejabber.com and follow pagination."""
    # https://www.sitejabber.com/reviews/zoosk.com
    self.category = category
    self.servicename = servicename
    reviews = [node.xpath('string()').extract()
               for node in response.xpath('//div[@class="review "]/p')]
    ratings = response.xpath("//div[@class='star_rating']/@title").extract()
    dates = response.xpath("//div[@class='time tiny_text faded_text']/text()").extract()
    headings = response.xpath("//div[@class='review_title']/a/text()").extract()
    # Author markup varies: use the wrapper's own text when present,
    # otherwise collect the text of its child elements.
    authors = []
    for content in response.xpath("//div[@class='author_name']").extract():
        root = etree.fromstring(content)
        if root.text is None:
            authors.extend(element.text for element in root)
        else:
            authors.append(root.text)
    website_name = response.xpath("//div[@id='header_top']/a[@id='header_logo']/picture/img/@alt").extract()
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, ratings[idx], headings[idx],
                               dates[idx], authors[idx], category,
                               servicename, review, None, website_name)
        record.save()
    next_page = response.xpath("// div[ @class ='paginator_next']/span/a[@class ='button outline']/@href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            yield response.follow(url=next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape reviews from webhostinggeeks.com provider pages.

    No per-review rating is available here, so None is stored for it.
    """
    # https://webhostinggeeks.com/providers/hostgator?product=shared
    reviews = [node.xpath('string()').extract()
               for node in response.xpath('//div[@class="text_description"]')]
    dates = response.xpath("//div[@class='top_line']/span/text()").extract()
    headings = response.xpath(
        "//div[@class='info_description']/p[@class='title_description ']/a/text()"
    ).extract()
    authors = response.xpath("//div[@class='user-text']/p/text()").extract()
    website_name = response.xpath("/html/head/meta[9]/@content").extract()
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, None, headings[idx], dates[idx],
                               authors[idx], category, servicename, review,
                               "", website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from vpnmentor.com."""
    # https://www.highya.com/coinbase-reviews
    self.category = category
    self.servicename = servicename
    print("review from vpnmentor.com")
    reviews = [node.xpath('string()').extract()
               for node in response.xpath(
                   "//div[@class='review-item style_prevu_kit ']/div[@class='review-content']/p"
               )]
    ratings = response.xpath("//div[@class='rate']/ul/li/text()").extract()
    dates = response.xpath(
        "//div[@class='row']/div[@class='col-md-4 col-xs-5']/div[@class='user']/div[@class='text-wrap']/h6/text()"
    ).extract()
    headings = response.xpath(
        "//div[@class='row']/div[@class='col-md-6 col-md-pull-2 col-xs-12']/div[@class='topic']/span/text()"
    ).extract()
    authors = response.xpath("//div[@class='text-wrap']/h5/text()").extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    print(" Ratings ", len(ratings), ratings)
    print("dates ", len(dates), dates)
    print(" Reviews ", len(reviews), reviews)
    print(" headings ", len(headings), headings)
    print(" authors ", len(authors), authors)
    print(" website_name ", len(website_name), website_name)
    for idx, review in enumerate(reviews):
        record = ServiceRecord(response.url, ratings[idx], headings[idx],
                               dates[idx], authors[idx], category,
                               servicename, review, None, website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from seniordatingsites.com.

    Star ratings are encoded in the star-image URL and decoded via
    getStarts().
    """
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from seniordatingsites.com")
    for node in response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-review']"):
        reviews.append(node.xpath('string()').extract())
    ratings1 = response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-stars']/img/@src").extract()
    # BUG FIX: the original rebound `ratings` to a map object, so the
    # len() and indexing below raised TypeError on Python 3.  A list
    # comprehension produces a real list with identical contents.
    ratings = [str(getStarts(src)).replace('.', '') for src in ratings1]
    authors = response.xpath("//div[@id='main-inner']/ul[@id='user-reviews']/li/div[@class='userrev']/div[@class='user-name']/text()").extract()
    website_name = response.xpath("//div[@id='container']/div[@id='header']/div[@class='left eight columns']/div/a[@class='logo']/img/@title").extract()
    print(" Ratings ", len(ratings), ratings)
    print(" Reviews ", len(reviews), reviews)
    print(" authors ", len(authors), authors)
    print(" website_name ", len(website_name), website_name)
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item], None, None,
                                     authors[item], category, servicename,
                                     reviews[item], None, website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape comment-style reviews from vpnranks.com."""
    reviews = []
    self.category = category
    self.servicename = servicename
    for node in response.xpath("//div[@class='comment-body']"):
        reviews.append(node.xpath('string()').extract())
    dates = response.xpath(
        "//div[@class='comment-meta commentmetadata']/a/text()").extract()
    authors1 = response.xpath(
        "//div[@class='comment-author vcard']").extract()
    authors = []
    for content in authors1:
        root = etree.fromstring(content)
        # NOTE(review): the original's if/else branches were identical --
        # both collected child-element text -- so the branch is collapsed.
        for element in root:
            authors.append(element.text)
    # Thin out duplicated author entries.  BUG FIX: `len(authors) / 2` is a
    # float on Python 3 and range() raised TypeError -- use floor division.
    for i in range(len(authors) // 2 + 1):
        if i != 0:
            del authors[i]
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, None, None, dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], None, website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from hostingcharges.in.

    Each review exposes four sub-scores; they are averaged into a single
    rating before saving.
    """
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from hostingcharges.in")
    for node in response.xpath(
            "//div[@class='review-cntnr']/div[@class='review-sub-cntnr']/div[@class='review-one-all']/p"
    ):
        reviews.append(node.xpath('string()').extract())
    ratings = response.xpath(
        "//div[@class='review-one-all']/div[@class='lftfeatures']/div/div/input/@value"
    ).extract()
    headings = response.xpath(
        "//div[@class='review-mid']/p/text()").extract()
    # Average every group of four sub-scores into one rating.
    # BUG FIX: the original read its accumulator before ever assigning it
    # (UnboundLocalError on the first iteration) and shadowed builtin `sum`.
    ratings1 = []
    total = 0
    for i, value in enumerate(ratings):
        total += int(value)
        if i % 4 == 3:
            ratings1.append(str(total / 4.0))
            total = 0
    if len(ratings) % 4:
        # Trailing partial group; the original divided by 4 regardless.
        ratings1.append(str(total / 4.0))
    dates = response.xpath(
        "//div[@class='review-sub-cntnr']/div[@class='review-one-all']/div[@class='review-profile']/div[@class='review-mid']/p/text()"
    ).extract()
    img_src = response.xpath(
        "//div[@class='logo-profile']/img/@src").extract()
    authors = response.xpath(
        "//div[@class='review-mid']/h4/text()").extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    print(" Ratings ", len(ratings1), ratings1)
    reviews = [[s.strip() for s in nested] for nested in reviews]
    # Drop empty review bodies.  BUG FIX: the original incremented the
    # index even after deleting, skipping the element that slid into the
    # freed slot.
    i = 0
    count = 0
    while i < len(reviews):
        if reviews[i][0] == '':
            del reviews[i]
            count = count + 1
        else:
            i = i + 1
    print("dates ", len(dates), dates)
    print(" Reviews ", len(reviews), reviews)
    print(" headings ", len(headings), headings)
    print(" authors ", len(authors), authors)
    print(" website_name ", len(website_name), website_name)
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings1[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from HighYa.com and follow pagination."""
    # https://www.highya.com/coinbase-reviews
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from HighYa.com")
    for node in response.xpath(
            "//div[@class='left-col col-lg-8 col-lg']/div[@id='reviews']/ul[@class='no-list list-review']/li/span/div[@class='description']"
    ):
        reviews.append(node.xpath('string()').extract())
    ratings = response.xpath(
        "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/span[@class='review']/meta[@itemprop='ratingValue']/@content"
    ).extract()
    dates = response.xpath(
        "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/ul[@class='list-line options']/li[last()-1]/text()"
    ).extract()
    headings = response.xpath(
        "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/h3[@class='title']/text()"
    ).extract()
    # Author markup varies (sometimes an anchor, sometimes plain text), so
    # parse the whole <li> and take either the wrapper text or its
    # children's text.
    authors1 = response.xpath(
        "//div[@id='reviews']/ul[@class='no-list list-review']/li/span/ul[@class='list-line options']/li[1]"
    ).extract()
    authors = []
    for content in authors1:
        # BUG FIX: the original printed debug output and `break`-ed out of
        # this loop with the parsing commented out, leaving `authors` empty
        # and guaranteeing an IndexError below whenever reviews existed.
        root = etree.fromstring(content)
        if root.text is None:
            for element in root:
                authors.append(element.text)
        else:
            authors.append(root.text)
    website_name = response.xpath("//html/head/meta[7]/@content").extract()
    print(" Ratings ", len(ratings), ratings)
    print("dates ", len(dates), dates)
    print(" Reviews ", len(reviews), reviews)
    print(" headings ", len(headings), headings)
    print(" authors ", len(authors), authors)
    print(" website_name ", len(website_name), website_name)
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], None, website_name)
        servicename1.save()
    next_page = response.xpath(
        "//div[@class='pagination']/a[@class='next']/@href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url, " url")
            yield response.follow(url=next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape reviews from yscam.com and follow pagination."""
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from yscam.com")
    for node in response.xpath(
            "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock ']/div[@class='comment ']/div"
    ):
        reviews.append(node.xpath('string()').extract())
    reviews = [[s.strip() for s in nested] for nested in reviews]
    # Drop the site's action widgets scraped alongside the review text.
    # BUG FIX: the original used two hand-rolled while-loops with
    # inconsistent casing ('Mark as Useful' vs 'Mark as useful') and an
    # index that skipped the element after every deletion; a single filter
    # removes every occurrence of either variant.
    junk = ('Mark as Useful', 'Mark as useful', 'Post Reply')
    kept = [entry for entry in reviews if not (entry and entry[0] in junk)]
    count = len(reviews) - len(kept)
    reviews = kept
    ratings = response.xpath(
        "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock ']/div[@class='comment ']/ul[@class='postby']/li[2]/span[@class='smallStars']/@data-score"
    ).extract()
    dates = response.xpath(
        "//body/section[@class='row body inside']/section[@class='comments-block']/section[@class='commentblock ']/div[@class='comment ']/ul[@class='postby']/li[1]/text()"
    ).extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    print("Reviews ", len(reviews), reviews)
    print("Rating ", len(ratings), ratings)
    print("Dates ", len(dates), dates)
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item], None,
                                     dates[item], None, category,
                                     servicename, reviews[item], None,
                                     website_name)
        servicename1.save()
    next_page = response.xpath(
        "//div[@class ='navigator']/a[7]/@href").extract()
    if next_page is not None:
        next_page_url = "".join(next_page)
        if next_page_url and next_page_url.strip():
            print(type(next_page_url))
            print(next_page_url)
            yield response.follow(next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape reviews from the Fatcow hosting-review site.

    Five per-aspect star images belong to each review; they are averaged
    into one overall rating before saving.
    """
    reviews = []
    self.category = category
    self.servicename = servicename
    authors = response.xpath(
        "//div[@class='comment-user-left name']/text()").extract()
    dates = response.xpath(
        "//div[@class='comment-user-left date']/text()").extract()
    website_name = response.xpath(
        "//div[@id='line']/a[1]/img/@alt").extract()
    headings = response.xpath(
        "//div[@class='comments_user_comment']/a/text()").extract()
    ratings1 = response.xpath(
        "//div[@class='comment_user_star_rate']/div[@class='comment_user_stars']/img/@src"
    ).extract()
    if len(ratings1) == 0:
        # Alternate page layout.
        ratings1 = response.xpath(
            "//div[@class='rating pure-u-1 pure-u-lg-1-3']/img[@class='stars overall']/@alt"
        ).extract()
    # BUG FIX: `i` was never initialised before the original while-loop
    # (NameError), and map() objects were later fed to len()/indexing,
    # which fails on Python 3.  List comprehensions fix both.
    ratings = [str(getStarts(src)).replace('-', '').replace('.', '')
               for src in ratings1]
    # Average every group of five per-aspect scores into one rating.
    total = 0
    ratings2 = []
    for i, value in enumerate(ratings):
        total += int(value)
        if i % 5 == 4:
            ratings2.append(str(total / 5.0))
            total = 0
    if len(ratings) % 5:
        # Trailing partial group; the original divided by 5 regardless.
        ratings2.append(str(total / 5.0))
    for node in response.xpath('//div[@class="comment-body"]'):
        reviews.append(node.xpath('string()').extract())
    if len(reviews) == 0:
        for node in response.xpath(
                '//div[@class="comment pure-u-1 pure-u-lg-2-3 wcc"]'):
            reviews.append(node.xpath('string()').extract())
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings2[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], None, website_name)
        servicename1.save()
def crawl(self, response,category,servicename): reviews = [] # https://tbwhs.com/fatcow-web-hosting-reviews/ for node in response.xpath(''): reviews.append(node.xpath('string()').extract()); ratings = dates = headings = authors = website_name = for item in range(1, len(reviews)): servicename1 = ServiceRecord(response.url,ratings[item],headings[item],dates[item],authors[item],category,servicename,reviews[item],"",website_name); servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape broker reviews (review_top / review_details markup)."""
    reviews = []
    for node in response.xpath('//div[@class="review_top"]/p'):
        reviews.append(node.xpath('string()').extract())
    headings = response.xpath("//div[@class='review']/div[@class='review_top']/span/h3/a/text()").extract()
    dates = response.xpath("//div[@class='review_details']/span/text()").extract()
    ratings = response.xpath("//div[@class='review_details']/div/div/a/text()").extract()
    authors = response.xpath("//div[@class='review_details']/span/strong/text()").extract()
    img_src = response.xpath("//div[@class='broker_img_container']/img/@src").extract()
    website_name = response.xpath("//div[@class='content'][1]/div[@class='top']/a[@class='logo']/@title").extract()
    for item in range(0, len(reviews)):
        # BUG FIX: the original stored the entire `headings` list on every
        # record instead of the matching per-review entry.
        servicename1 = ServiceRecord(response.url, ratings[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape store reviews from resellerratings.com (e.g. the Nordvpn_com
    store page) and save each review as a ServiceRecord."""
    reviews = []
    print("Reviews from Resellerrating.com")
    # https://www.resellerratings.com/store/Nordvpn_com
    for node in response.xpath(
            "//div[@class='comment']/p[@class='review-body']/span"):
        reviews.append(node.xpath('string()').extract())
    headings = response.xpath(
        "//div[@class='comment']/p[@class='review-title']/span/text()"
    ).extract()
    dates = response.xpath(
        "//div[@class='comment']/div[@class='date fr']/span/text()"
    ).extract()
    ratings = response.xpath(
        "//div[@class='rating siteStars fl']/span[@class='ratingLabel']/span[@class='bold']/text()"
    ).extract()
    authors = response.xpath(
        "//div[@class='user-column']/a[@class='rr-purple show-for-large']/text()"
    ).extract()
    website_name = response.xpath(
        "//html/head/meta[15]/@content").extract()
    for item in range(0, len(reviews)):
        # BUG FIX: the original passed only nine positional arguments, so
        # website_name landed in the img_src slot; every other crawler in
        # this file passes ten. Pass None for img_src explicitly.
        servicename1 = ServiceRecord(response.url, ratings[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], None, website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Placeholder crawler for https://reviews.financesonline.com/p/vyprvpn/ .

    BUG FIX: the original body did not parse (`for node in :` and a dangling
    chained assignment `dates = headings = ... =`). The selectors still need
    to be written (see TODOs), but the method is now syntactically valid and
    saves nothing until they are.
    """
    self.category = category
    self.servicename = servicename
    reviews = []
    # TODO: replace with the real review-body selector,
    # e.g. response.xpath("<review body XPath>")
    review_nodes = []
    for node in review_nodes:
        reviews.append(node.xpath('string()').extract())
    # TODO: real selectors pending; empty defaults keep the save loop a
    # safe no-op instead of a SyntaxError.
    dates = []
    headings = []
    authors = []
    img_src = None
    website_name = None
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, None, None, dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from webhostinghero.com, save each as a ServiceRecord,
    and follow the pager's next link when one exists.

    Note: this is a generator (it yields follow-up requests), so Scrapy must
    iterate it for the records to be saved.
    """
    reviews = []
    self.category = category
    self.servicename = servicename
    for node in response.xpath("//div[@class='box col-12 review-detail']"):
        reviews.append(node.xpath('string()').extract())
    ratings1 = response.xpath(
        "//div[@class='box col-12 review-title']/meta[@itemprop='ratingValue']/@content"
    ).extract()
    dates = response.xpath(
        "//div[@class='box col-12 review-info']/span[@class='review-date']/text()"
    ).extract()
    headings = response.xpath(
        "//div[@class='box col-12 review-title']/h4/text()").extract()
    authors = response.xpath(
        "//div[@class='box col-12 review-info']/strong/span/text()"
    ).extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    img_src = response.xpath("//div[@class='avatar']/img/@src").extract()
    # Site rates out of 10; halve to a 5-point scale (string, per ServiceRecord
    # usage elsewhere in this file).
    ratings = [str(int(r) / 2.0) for r in ratings1]
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item],
                                     headings[item], dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
    # BUG FIX: .extract() always returns a list (never None), so the original
    # `if next_page is not None` was always true; test the joined URL itself.
    next_page_url = "".join(response.xpath(
        "//div[@class ='navigator']/a[7]/@href").extract())
    if next_page_url.strip():
        print(next_page_url)
        yield response.follow(next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Placeholder crawler for datingsitesreviews.com (BlackPeopleMeet page).

    BUG FIX: the original body did not parse (`for node in :` and a dangling
    chained assignment `authors = img_src = website_name =`). The comment
    selectors that were already written are kept; the rest still need to be
    filled in (see TODOs), but the method is now syntactically valid.
    """
    self.category = category
    self.servicename = servicename
    reviews = []
    # https://www.datingsitesreviews.com/staticpages/index.php?page=BlackPeopleMeet-Reviews&query=blackpeoplemeet
    # TODO: replace with the real review-body selector,
    # e.g. response.xpath("<review body XPath>")
    review_nodes = []
    for node in review_nodes:
        reviews.append(node.xpath('string()').extract())
    temp_data = response.xpath(
        "//div[@id='comments']/div[@class='block-comment-content level-0']/ul[@class='comment_status']/li[@class='comment_author']/text()"
    ).extract()
    headings = response.xpath(
        "//div[@id='comments']/div[@class='block-comment-content level-0']/ul[@class='comment_status']/li[@class='comment_title']/text()"
    ).extract()
    # TODO: parse dates/authors (presumably out of temp_data); empty
    # defaults keep the save loop a safe no-op until then.
    dates = []
    authors = []
    img_src = None
    website_name = None
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, None, None, dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], img_src, website_name)
        servicename1.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from consumeraffairs.com (e.g. the GoDaddy profile),
    save each as a ServiceRecord, and follow the profile pager.

    Note: this is a generator (it yields follow-up requests), so Scrapy must
    iterate it for the records to be saved.
    """
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from Consumeraffairs.com")
    # https://www.consumeraffairs.com/internet/godaddy.html
    for node in response.xpath(
            "//div[@class='campaign-reviews__regular-container js-campaign-reviews__regular-container']/div/div[@class='rvw-bd ca-txt-bd-2']/p"
    ):
        reviews.append(node.xpath('string()').extract())
    ratings = response.xpath(
        "//div[@class='stars-rtg stars-rtg--sm']/@data-rating").extract()
    # BUG FIX: slicing replaces five consecutive .pop(0) calls, which raised
    # IndexError whenever fewer than five widgets matched. (Presumably the
    # first five star widgets are page chrome, not reviews — TODO confirm.)
    ratings = ratings[5:]
    temp_dates = response.xpath(
        "//div[@class='rvw-bd ca-txt-bd-2']/span[@class='ca-txt-cpt ca-txt--clr-gray']/text()"
    ).extract()
    dates = []
    for date in temp_dates:
        # Keep the text after the first colon ("Original review: <date>").
        # BUG FIX: guard strings without a colon, which raised IndexError.
        parts = date.split(":")
        dates.append(parts[1] if len(parts) > 1 else date)
    authors = response.xpath(
        "//div[@class='rvw-aut']/div[@class='rvw-aut__inf']/strong[@class='rvw-aut__inf-nm']/text()"
    ).extract()
    website_name = response.xpath("//html/head/meta[3]/@content").extract()
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, None, None, dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], None, website_name)
        servicename1.save()
    # BUG FIX: .extract() always returns a list (never None), so the original
    # `if next_page is not None` was always true; test the joined URL itself.
    next_page_url = "".join(response.xpath(
        "//div[@class='prf-lst']/nav[@class='prf-pgr js-profile-pager']/a[@class='ca-a-md "
        "ca-a-uprcs ca-a-blk prf-pgr__nxt js-profile-pager__next']/@href"
    ).extract())
    if next_page_url.strip():
        print(next_page_url, " url")
        yield response.follow(url=next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Extract user reviews from a thevpnlab.com review page (e.g. the
    NordVPN review) and persist one ServiceRecord per review.

    The rating stored is the id attribute of the star widget; author and
    date are split out of a single "By <author> on <date>" byline string.
    """
    # https://www.thevpnlab.com/reviews/nordvpn-review
    reviews = [node.xpath('string()').extract()
               for node in response.xpath(
                   "//div[@class='ur-inner']/div[@class='user-review']")]
    bylines = response.xpath(
        "//div[@class='ur-inner']/div[@class='user-name']/text()").extract()
    # "By <author> on <date>" -> author between "By" and "on", date after
    # the last "on".
    authors = [line.split("on")[0].split("By")[1] for line in bylines]
    dates = [line.split("on")[-1] for line in bylines]
    ratings = response.xpath("//div[@class='user-stars']/div/@id").extract()
    img_src = response.xpath(
        "//div[@class='introvoerview']/div[@id='introimg']/img/@src").extract()
    # Site name is embedded in the fourth head script; take field 4 of the
    # comma-separated blob, after its colon.
    script_text = response.xpath("//html/head/script[4]/text()").extract()
    website_name = script_text[0].split(",")[3].split(":")[1]
    for idx in range(len(reviews)):
        record = ServiceRecord(response.url, ratings[idx], None, dates[idx],
                               authors[idx], category, servicename,
                               reviews[idx], img_src, website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape customer reviews from 10bestonline.com review tables, save each
    as a ServiceRecord, and follow the pager's next link when one exists.

    Note: this is a generator (it yields follow-up requests), so Scrapy must
    iterate it for the records to be saved.
    """
    reviews = []
    self.category = category
    self.servicename = servicename
    print("review from 10bestonline.com")
    for node in response.xpath(
            "//div[@class='cust_review']/table/tbody/tr[5]/td[@class='comment']"
    ):
        reviews.append(node.xpath('string()').extract())
    dates = response.xpath(
        "//div[@class='customer_reviews']/div/div[@class='cust_review']/table/tbody/tr[2]/td[@class='customer']/text()"
    ).extract()
    # NOTE(review): this selector extracts whole <td> elements, so
    # authors[item] holds raw HTML markup; presumably /text() was intended —
    # TODO confirm before changing, callers may rely on current output.
    authors = response.xpath(
        "//div[@class='cust_review']/table/tbody/tr[3]/td[@class='customer']"
    ).extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, None, None, dates[item],
                                     authors[item], category, servicename,
                                     reviews[item], None, website_name)
        servicename1.save()
    # BUG FIX: .extract() always returns a list (never None), so the original
    # `if next_page is not None` was always true; test the joined URL itself.
    next_page_url = "".join(response.xpath(
        "//div[@class ='navigator']/a[7]/@href").extract())
    if next_page_url.strip():
        print(next_page_url)
        yield response.follow(next_page_url, callback=self.parsing)
def crawl(self, response, category, servicename):
    """Scrape reviews from hostadvice.com (e.g. the GoDaddy reviews page),
    save each as a ServiceRecord, and follow the next-page link.

    Note: this is a generator (it yields follow-up requests), so Scrapy must
    iterate it for the records to be saved.
    """
    self.category = category
    self.servicename = servicename
    reviews = []
    # https://hostadvice.com/hosting-company/godaddy-reviews/
    for node in response.xpath('//div[@class="review-summary"]'):
        reviews.append(node.xpath('string()').extract())
    ratings = response.xpath(
        "//div[@class='review-rating clearfix']/span[@class='review-score']/text()"
    ).extract()
    headings = response.xpath(
        "//div[@class='review-content']/h3[@class='review_header']/text()"
    ).extract()
    authors1 = response.xpath("//div[@class='review-author']").extract()
    authors = []
    for content in authors1:
        root = etree.fromstring(content)
        for element in root:
            # The author name is either a bare <strong> or wrapped in a link.
            if (element.tag == 'strong'):
                authors.append(element.text)
            else:
                authors.append(element.xpath("//a/strong")[0].text)
    img_src = response.xpath(
        "//div[@class='col-md-offset-1 col-md-5 col-xs-6']/img[ @class='attachment-post-thumbnail size-post-thumbnail wp-post-image']/@src"
    ).extract()
    website_name = response.xpath(
        "//div[@class='location_info']/span[2]/span[1]/a[@class='home']/span/text()"
    ).extract()
    for item in range(0, len(reviews)):
        servicename1 = ServiceRecord(response.url, ratings[item],
                                     headings[item], None, authors[item],
                                     category, servicename, reviews[item],
                                     img_src, website_name)
        servicename1.save()
    # BUG FIX: the original joined hrefs with " ", producing an invalid URL
    # whenever more than one link matched (siblings use ""); and .extract()
    # never returns None, so `if next_page is not None` was always true —
    # test the joined URL itself instead.
    next_page_url = "".join(response.xpath(
        "//div[@class='col-md-offset-2 col-md-4']/a[ @class ='orange_button']/@href"
    ).extract())
    if next_page_url.strip():
        yield Request(url=next_page_url, callback=self.parsing)