def crawl(self, response, category, servicename):
    """Scrape user reviews from seniordatingsites.com and persist them.

    Extracts review bodies, star-image ratings, author names and the
    site name from the response, then saves one ServiceRecord per
    review.  Assumes ratings/authors line up 1:1 with reviews — an
    IndexError here means the page layout changed.
    """
    self.category = category
    self.servicename = servicename
    print("review from seniordatingsites.com")
    # https://www.highya.com/coinbase-reviews
    base = ("//div[@id='main-inner']/ul[@id='user-reviews']/li"
            "/div[@class='userrev']")
    reviews = [
        node.xpath('string()').extract()
        for node in response.xpath(base + "/div[@class='user-review']")
    ]
    # BUG FIX: the original fed the star values through map(), which is
    # lazy under Python 3, so the ratings[item] indexing below raised
    # TypeError.  Build a real list instead.  getStarts pulls the
    # numeric rating out of the star-image URL; strip the trailing '.'
    # (e.g. '4.' -> '4').
    star_srcs = response.xpath(
        base + "/div[@class='user-stars']/img/@src").extract()
    ratings = [str(getStarts(src)).replace('.', '') for src in star_srcs]
    authors = response.xpath(
        base + "/div[@class='user-name']/text()").extract()
    website_name = response.xpath(
        "//div[@id='container']/div[@id='header']"
        "/div[@class='left eight columns']/div/a[@class='logo']"
        "/img/@title").extract()
    print(" Ratings ", len(ratings), ratings)
    print(" Reviews ", len(reviews), reviews)
    print(" authors ", len(authors), authors)
    print(" website_name ", len(website_name), website_name)
    for idx in range(len(reviews)):
        record = ServiceRecord(response.url, ratings[idx], None, None,
                               authors[idx], category, servicename,
                               reviews[idx], None, website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from top11hosting.com and persist them.

    Ratings arrive as CSS width percentages (0-100) in a style
    attribute; they are divided by 20 to map onto a 0-5 star scale.
    """
    self.category = category
    self.servicename = servicename
    # TODO rating coming in percentage
    # https://top11hosting.com/hostgator-review/
    reviews = [
        node.xpath('string()').extract()
        for node in response.xpath(
            "//div[@class='wpcr3_item wpcr3_business']/div"
            "/blockquote[@class='wpcr3_content']")
    ]
    raw_ratings = response.xpath(
        "//div[@class='wpcr3_rating_style1_average']/@style").extract()
    # The first two style entries are page-level aggregates, not
    # per-review ratings, so drop them (original did two pop(0) calls).
    del raw_ratings[:2]
    dates = response.xpath(
        "//div[@class='wpcr3_review_datePublished']/text()").extract()
    authors = response.xpath(
        "//div[@class='wpcr3_review_author']/span[@class='wpcr3_caps']"
        "/text()").extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    # Convert each percentage width (e.g. 80) into stars (e.g. 4.0).
    ratings1 = [str(int(getStarts(raw)) / 20) for raw in raw_ratings]
    for idx in range(len(reviews)):
        record = ServiceRecord(response.url, ratings1[idx], None,
                               dates[idx], authors[idx], category,
                               servicename, reviews[idx], None,
                               website_name)
        record.save()
def crawl(self, response, category, servicename):
    """Scrape reviews from webshostingFatcow.com and persist them.

    Each review carries five star images (one per sub-category); the
    five values are averaged into a single overall rating.  Falls back
    to an alternate page layout when the primary selectors match
    nothing.
    """
    self.category = category
    self.servicename = servicename
    authors = response.xpath(
        "//div[@class='comment-user-left name']/text()").extract()
    dates = response.xpath(
        "//div[@class='comment-user-left date']/text()").extract()
    website_name = response.xpath(
        "//div[@id='line']/a[1]/img/@alt").extract()
    headings = response.xpath(
        "//div[@class='comments_user_comment']/a/text()").extract()
    star_srcs = response.xpath(
        "//div[@class='comment_user_star_rate']"
        "/div[@class='comment_user_stars']/img/@src").extract()
    if not star_srcs:
        # Alternate layout: the stars are encoded in the alt text.
        star_srcs = response.xpath(
            "//div[@class='rating pure-u-1 pure-u-lg-1-3']"
            "/img[@class='stars overall']/@alt").extract()
    # BUG FIX: the original ran `while i < len(ratings1)` without ever
    # initialising `i` (NameError), and then wrapped the list in map(),
    # which is lazy under Python 3 so the later len()/indexing failed.
    # A list comprehension fixes both.  The replace() calls strip '-'
    # and '.' from the value getStarts extracts from the image URL/alt.
    ratings = [
        str(getStarts(src)).replace('-', '').replace('.', '')
        for src in star_srcs
    ]
    # Average every consecutive group of five sub-ratings into one
    # overall rating per review.  (`total` instead of the original
    # `sum`, which shadowed the builtin.)
    ratings2 = []
    total = 0
    for idx, value in enumerate(ratings):
        if idx != 0 and idx % 5 == 0:
            ratings2.append(str(total / 5.0))
            total = 0
        total += int(value)
    ratings2.append(str(total / 5.0))
    reviews = [
        node.xpath('string()').extract()
        for node in response.xpath('//div[@class="comment-body"]')
    ]
    if not reviews:
        reviews = [
            node.xpath('string()').extract()
            for node in response.xpath(
                '//div[@class="comment pure-u-1 pure-u-lg-2-3 wcc"]')
        ]
    for idx in range(len(reviews)):
        record = ServiceRecord(response.url, ratings2[idx],
                               headings[idx], dates[idx], authors[idx],
                               category, servicename, reviews[idx],
                               None, website_name)
        record.save()
def str11(self):
    """Return this record serialized as a plain dict for export.

    The rating and date are normalized through the shared utils
    helpers (getStarts / convertDate); everything else is passed
    through verbatim.  (Removed the unused `store_data_dict` local.)
    """
    return {
        "absolute_url": self.url,
        "rating": utils.getStarts(self.rating),
        "review_title": self.heading,
        # e.g. datetime.datetime(2010, 2, 15, 0, 0)
        "reviewed_at": utils.convertDate(self.date),
        "reviewer_name": self.author,
        "category": self.category,
        "service_name": self.service_name,
        # Only the first extracted text fragment is exported.
        "review_text": self.reviews[0],
        "picture_urls": self.img_src,
        "website_name": self.website_name,
    }
def crawl(self, response, category, servicename):
    """Scrape reviews from blackpeoplemeet.pissedconsumer.com.

    Saves one ServiceRecord per review and yields a follow request for
    the next page.  NOTE: because of the `yield`, this method is a
    generator — the caller must iterate it for the save() side effects
    to run.
    """
    self.category = category
    self.servicename = servicename
    print("review from blackpeoplemeet.pissedconsumer.com")
    content = ("//div[@class='mid_left']/div[@class='mid_left_site']"
               "/div[@class='latest_reviews_a']"
               "/div[@class='latest_reviews_content']")
    reviews = [
        node.xpath('string()').extract()
        for node in response.xpath(content + "/div")
    ]
    star_srcs = response.xpath(content + "/font/img/@src").extract()
    raw_bylines = response.xpath(
        content + "/font/span[1]/text()").extract()
    headings = response.xpath(content + "/font/a/text()").extract()
    website_name = response.xpath(
        "//div[@class='wpcr3_item_name']/a/text()").extract()
    # BUG FIX: the original reassigned the byline list through map(),
    # which is lazy under Python 3, so the len()/indexing below raised
    # TypeError.  Build a real list: strip non-breaking spaces and the
    # 'By' prefix.
    raw_bylines = [
        s.replace(u'\xa0', u'').replace('By', '') for s in raw_bylines
    ]
    # Bylines look like 'Author | date ...' — split into the two parts.
    authors = []
    dates = []
    for byline in raw_bylines:
        parts = byline.split('|')
        authors.append(parts[0])
        dates.append(parts[1])
    # NOTE(review): this splits the FULL byline (not the date part) on
    # spaces and keeps token [1], matching the original behaviour —
    # confirm that token really is the date; `dates[i]` may have been
    # intended here.
    dates2 = [raw_bylines[i].split(' ')[1] for i in range(len(dates))]
    # A '01.' star image means a full 5-star review; anything else 0.
    ratings = []
    for src in star_srcs:
        ratings.append(5 if getStarts(src) == '01.' else 0)
    # Average each consecutive group of five star flags into one
    # overall rating per review.
    ratings2 = []
    total = 0
    for idx, value in enumerate(ratings):
        if idx != 0 and idx % 5 == 0:
            ratings2.append(str(round(total / 5.0, 2)))
            total = 0
        total += int(value)
    ratings2.append(str(round(total / 5.0, 2)))
    for idx in range(len(reviews)):
        record = ServiceRecord(response.url, ratings2[idx],
                               headings[idx], dates2[idx], authors[idx],
                               category, servicename, reviews[idx],
                               None, website_name)
        record.save()
    next_page = response.xpath(
        "//div[@class='mid_left']/div[@class='mid_left_site']"
        "/div[@class='viciao']/div[@class='page_cut']"
        "/a[@class='cur_pageva']/@href").extract()
    # extract() always returns a list (never None), so the original
    # `is not None` guard was redundant; join and test for content.
    next_page_url = "".join(next_page)
    if next_page_url.strip():
        print(type(next_page_url))
        print(next_page_url)
        yield response.follow(next_page_url, callback=self.parsing)