def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    The platform is hard-coded to ``'Android'``; every other field is read
    from ``response`` with XPath queries. The populated item is only
    printed -- it is neither returned nor yielded.

    Raises:
        IndexError: if a required XPath query matches nothing.
        ValueError: if the score is not numeric or the date text does not
            match ``'%b %d, %Y'``.
    """
    item = ReviewrItem()

    url = response.url
    select = response.xpath
    platform = 'Android'

    title = select('//h1[@class="title"]/text()').extract()[0]
    # The game name is the headline with the word "Review"/"review" removed.
    without_upper = title.replace('Review', '')
    game_name = without_upper.replace('review', '')

    author = select('//a[@rel="author"]/text()').extract()[0]

    # When several score nodes match, the last one is used.
    score_texts = select('//div[@class="score"]/text()').extract()
    score = float(score_texts[-1])

    date_text = select('//p[@class="date"]/span/text()').extract()[0]
    parsed = datetime.strptime(date_text.strip(), '%b %d, %Y')
    date = parsed.date()

    # The conclusion is the plain text of the last three paragraphs.
    closing_html = select('//div[@class="entry-content"]/p').extract()[-3:]
    closing_bytes = ''.join(closing_html).encode('utf-8')
    conclusion = BeautifulSoup(closing_bytes, 'html.parser').get_text().strip()

    populated = (
        ('title', title),
        ('date', date),
        ('game', game_name),
        ('platform', platform),
        ('author', author),
        ('url', url),
        ('score_orig', score),
        ('score_critic', score),
        ('conclusion', conclusion),
    )
    for field, value in populated:
        item[field] = value

    print(item)
def parse_contents(self, response):
    """Build a ``ReviewrItem`` from the page's meta tags and print it.

    Title, conclusion, URL, date and score all come from ``<meta>`` content
    attributes; the author comes from the ``rel="author"`` link. The
    platform is whatever ``self.check_platform`` returns for the URL. The
    item is printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches nothing, or the tags
            string has fewer than four whitespace-separated tokens.
    """
    def first(query):
        # First extracted match for ``query``; IndexError when none match.
        return response.xpath(query).extract()[0]

    item = ReviewrItem()

    # Only the first ten characters of the date string are kept.
    date = first('//meta[@name="sailthru.date"]/@content')[:10]
    author = first('//a[@rel="author"]/text()')
    title = first('//meta[@property="og:title"]/@content')
    name = title.replace(" Review", "")
    conclusion = first('//meta[@property="og:description"]/@content')
    url = first('//meta[@property="og:url"]/@content')
    platform = self.check_platform(url)

    # The score is read from the fourth token of the tags string: its first
    # three characters, with '-' turned into '.'. It stays a string.
    tags = first('//meta[@name="sailthru.tags"]/@content')
    score_token = tags.split()[3]
    score = score_token[:3].replace('-', '.')

    item['title'] = title
    item['date'] = date
    item['game'] = name
    item['platform'] = platform
    item['author'] = author
    item['url'] = url
    item['score_orig'] = score
    item['score_critic'] = score
    item['conclusion'] = conclusion

    print(item)
def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    Fields are read from ``response`` with XPath queries. The conclusion is
    the plain text of the ``details`` block. The platform is taken from the
    text of the first two ``speakable-content`` paragraphs: whatever sits
    between the words "platform" and "version reviewed" (lower-cased), or
    the whole lower-cased text when that pattern is absent. The item is
    printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches nothing.
        ValueError: if the letter-grade text is not numeric.
    """
    item = ReviewrItem()

    score = float(
        response.xpath('//h3[@class="letter-grade"]/text()').extract()[0])
    url = response.url
    title = response.xpath('//h1/text()').extract()[0]
    game_name = response.xpath(
        '//h3[@class="actor-name"]/text()').extract()[0]
    author = response.xpath(
        '//a[@class="auth-name"]/text()').extract()[0].strip()
    date = response.xpath('//time/@datetime').extract()[0]

    raw_conclusion = response.xpath('//div[@class="details"]').extract()[0]
    soup_con = BeautifulSoup(raw_conclusion, 'html.parser')
    conclusion = soup_con.get_text().strip()

    raw_pl = ', '.join(
        response.xpath('//p[@class="speakable-content"]').extract()[:2])
    soup_pl = BeautifulSoup(raw_pl, 'html.parser')
    pl_string = soup_pl.get_text().strip().lower()
    find_pl = re.findall(r'platform(.*?)version reviewed', pl_string)
    if not find_pl:
        # Pattern not found: fall back to the whole lower-cased text.
        find_pl = [pl_string]

    # This assignment was previously left unfinished (a bare placeholder),
    # which made the method a syntax error. Use the captured text directly.
    # NOTE(review): a platform-normalising helper was presumably intended
    # here -- confirm and plug it in if one exists for this spider.
    platform = ''.join(find_pl).strip()

    item['title'] = title
    item['date'] = date
    item['game'] = game_name
    item['platform'] = platform
    item['author'] = author
    item['url'] = url
    item['score_orig'] = score
    item['score_critic'] = score
    item['conclusion'] = conclusion

    print(item)
def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    The publication date comes from the second JSON-LD ``<script>`` found
    under a ``<div>``; the remaining fields come from XPath text queries.
    The raw platform heading is passed through ``self.check_platform``, and
    the score text has its decimal comma replaced by a dot before being
    converted to ``float``. The item is printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches too few nodes.
        KeyError: if the JSON-LD object has no ``datePublished`` key.
        ValueError: if the JSON or the score text cannot be parsed.
    """
    def first(query):
        # First extracted match for ``query``; IndexError when none match.
        return response.xpath(query).extract()[0]

    item = ReviewrItem()
    url = response.url

    # Index 1: the second matching JSON-LD script is the one that is parsed.
    ld_scripts = response.xpath(
        '//div/script[@type="application/ld+json"]/text()').extract()
    structured = json.loads(ld_scripts[1])
    date = structured['datePublished']

    conclusion = first('//p[@class="s16 b c3 lh27 fftext mar_rl4"]/text()')
    game_name = first('//div[@class="dtc vab oh"]/a/strong/text()')
    title = first('//title/text()').strip()
    author = first('//a[@rel="author"]/text()')

    platform_heading = first('//h2[@class="s18 as14_600 n"]/text()')
    platform = self.check_platform(platform_heading)

    score_text = first('//div[@id="val_ana_3"]/div[2]/span/text()').strip()
    score = float(score_text.replace(',', '.'))

    populated = (
        ('title', title),
        ('date', date),
        ('game', game_name),
        ('platform', platform),
        ('author', author),
        ('url', url),
        ('score_orig', score),
        ('score_critic', score),
        ('conclusion', conclusion),
    )
    for field, value in populated:
        item[field] = value

    print(item)
def parse_contents(self, response):
    """Build a ``ReviewrItem`` mostly from the page's JSON-LD and print it.

    The second JSON-LD ``<script>`` on the page is parsed (after removing
    newline characters) and supplies date, author, score, game name and
    platform. Title, URL and conclusion come from XPath queries. The item
    is printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches too few nodes.
        KeyError: if an expected key is missing from the JSON-LD object.
        ValueError: if the JSON-LD text is not valid JSON.
    """
    item = ReviewrItem()
    select = response.xpath

    # Index 1: the second JSON-LD script is the one that is parsed.
    ld_text = select('//script[@type="application/ld+json"]/text()').extract()[1]
    data = json.loads(ld_text.replace('\n', ''))

    title = select(
        '//h2[@class="text-big text-dark text-with-subtitle"]/text()'
    ).extract()[0]
    url = select('//meta[@property="og:url"]/@content').extract()[0]

    reviewed = data['itemReviewed']
    date = data['datePublished']
    author = data['author']['name']
    score = data['reviewRating']['ratingValue']
    game_name = reviewed['name']
    platform = reviewed['operatingSystem']

    # Reduce the description block's HTML to plain text.
    description_html = select('//div[@itemprop="description"]').extract()[0]
    description_bytes = ''.join(description_html).encode('utf-8')
    conclusion = BeautifulSoup(
        description_bytes, 'html.parser').get_text().strip()

    item['title'] = title
    item['date'] = date
    item['game'] = game_name
    item['platform'] = platform
    item['author'] = author
    item['url'] = url
    item['score_orig'] = score
    item['score_critic'] = score
    item['conclusion'] = conclusion

    print(item)
def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    Title, date and score are read from ``itemprop`` annotated elements.
    The author is read from the ``itemprop="author"`` span, falling back to
    the ``class="author"`` span when the first query matches nothing. The
    platform is the comma-separated list of link texts found in the header
    paragraph. No conclusion is extracted (the field is set to ``None``).
    The item is printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query (or both author queries)
            matches nothing.
    """
    item = ReviewrItem()

    url = response.url
    title = response.xpath('//h2[@itemprop="name"]/text()').extract()[0]
    # Only the first ten characters of the date string are kept.
    date = response.xpath(
        '//meta[@itemprop="datePublished"]/@content').extract()[0][0:10]
    score = response.xpath(
        '//span[@itemprop="ratingValue"]/text()').extract()[0]

    # Encoding to ASCII with errors ignored drops every non-ASCII character,
    # so the title prefix is matched in its stripped spelling below.
    game_raw = title.encode('ascii', 'ignore').decode('utf8')
    game_name = game_raw.replace('Anlisis de ', '')

    author_raw = response.xpath(
        '//span[@itemprop="author"]/span/text()').extract()
    if not author_raw:
        author_raw = response.xpath(
            '//span[@class="author"]/text()').extract()
    author = author_raw[0]

    platform_raw = response.xpath(
        '//div[@class="header"]/p/a/text()').extract()
    # This assignment was previously left unfinished (a bare placeholder),
    # which made the method a syntax error. Join the extracted names.
    # NOTE(review): a platform-normalising function was presumably
    # intended here -- confirm and plug it in if one exists.
    platform = ', '.join(platform_raw)

    item['title'] = title
    item['date'] = date
    item['game'] = game_name
    item['platform'] = platform
    item['author'] = author
    item['url'] = url
    item['score_orig'] = score
    item['score_critic'] = score
    item['conclusion'] = None

    print(item)
def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    The rating is read from the page's JSON-LD ``<script>`` blocks: every
    block whose raw text contains ``reviewRating`` is parsed, and the last
    such block determines the score. ``score_critic`` is the original
    rating multiplied by 2.0. When no block carries a rating, both score
    fields are set to ``None``. The remaining fields come from XPath
    queries; ``pub_id``, ``assign_to`` and ``content`` are fixed values.
    The item is printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches nothing.
        KeyError: if a matching JSON-LD object lacks the rating keys.
        ValueError: if a matching JSON-LD block is not valid JSON or its
            rating is not numeric.
    """
    item = ReviewrItem()

    # Bind both scores up front: without this, a page with no
    # ``reviewRating`` block hit an UnboundLocalError when the item was
    # populated further down.
    orig_score = None
    score = None

    data = response.xpath(
        '//script[@type="application/ld+json"]/text()').extract()
    for info in data:
        # Substring test on the raw JSON text, before parsing.
        if 'reviewRating' in info:
            load_info = json.loads(info)
            orig_score = float(load_info['reviewRating']['ratingValue'])
            # NOTE(review): presumably rescales a 5-point rating to a
            # 10-point one -- confirm.
            score = orig_score * 2.0

    url = response.url
    title = response.xpath('//h1/text()').extract()[0].strip()
    game_name = title
    author = response.xpath(
        '//div[@class="pageheader_byline"]/address/a/text()'
    ).extract()[0]
    date = response.xpath(
        '//meta[@property="article:published_time"]/@content'
    ).extract()[0]

    raw_platform = response.xpath(
        '//div[@class="categories_display"]/span/@tooltip'
    ).extract()
    platform = ', '.join(raw_platform)

    raw_conclusion = response.xpath(
        '//div[@class="review_box our_verdict"]/p').extract()[0]
    soup_con = BeautifulSoup(raw_conclusion, 'html.parser')
    conclusion = soup_con.get_text().strip()

    item['title'] = title
    item['date'] = date
    item['game'] = game_name
    item['platform'] = platform
    item['author'] = author
    item['url'] = url
    item['score_orig'] = orig_score
    item['score_critic'] = score
    item['conclusion'] = conclusion
    item['pub_id'] = 188
    item['assign_to'] = 2
    item['content'] = None

    print(item)
def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    The platform is hard-coded to ``'ios'``. The conclusion is the plain
    text of the last paragraph in the body block. The date text is handed
    to ``self.change_date``. The score is derived from the rating images:
    each ``/star.png`` counts as 1 and each ``/halfstar.png`` as 0.5;
    ``score_critic`` is that total multiplied by 2.0. The item is printed,
    not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches nothing.
    """
    item = ReviewrItem()
    select = response.xpath

    title = select('//h1/a/text()').extract()[0]
    author = select('//div[@class="postedby"]/a/text()').extract()[0]
    # The game name is the headline with "review"/"Review" removed.
    game_name = title.replace('review', '').replace('Review', '')
    platform = 'ios'
    url = response.url

    last_paragraph = select('//div[@class="body clearfloat"]/p').extract()[-1]
    paragraph_html = ''.join(last_paragraph)
    conclusion = BeautifulSoup(
        paragraph_html, 'html.parser').get_text().strip()

    # The last text node of the "postedby" block holds the date.
    date_text = select('//div[@class="postedby"]/text()').extract()[-1]
    date = self.change_date(date_text)

    # Count star images across all rating image sources at once.
    image_sources = ''.join(
        select('//span[@class="rating"]/img/@src').extract())
    whole_stars = float(image_sources.count('/star.png'))
    half_stars = float(image_sources.count('/halfstar.png') / 2.0)
    orig_score = whole_stars + half_stars
    score = (whole_stars + half_stars) * 2.0

    populated = (
        ('title', title),
        ('date', date),
        ('game', game_name),
        ('platform', platform),
        ('author', author),
        ('url', url),
        ('score_orig', orig_score),
        ('score_critic', score),
        ('conclusion', conclusion),
    )
    for field, value in populated:
        item[field] = value

    print(item)
def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    The platform is hard-coded to ``'android'`` and the game name is the
    page headline itself. ``score_critic`` is the final score shown on the
    page multiplied by 2.0. The item is printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches nothing.
        ValueError: if the final score text is not numeric.
    """
    def first(query):
        # First extracted match for ``query``; IndexError when none match.
        return response.xpath(query).extract()[0]

    item = ReviewrItem()
    platform = 'android'
    url = response.url

    title = first('//h1/text()')
    game_name = title
    author = first('//div[@class="td-post-author-name"]/a/text()')
    date = first('//span[@class="td-post-date"]/time/@datetime')
    conclusion = first('//div[@class="td-review-summary-content"]/text()')

    final_score_text = first('//div[@class="td-review-final-score"]/text()')
    orig_score = float(final_score_text)
    score = orig_score * 2.0

    populated = (
        ('title', title),
        ('date', date),
        ('game', game_name),
        ('platform', platform),
        ('author', author),
        ('url', url),
        ('score_orig', orig_score),
        ('score_critic', score),
        ('conclusion', conclusion),
    )
    for field, value in populated:
        item[field] = value

    print(item)
def parse_contents(self, response):
    """Scrape one review page into a ``ReviewrItem`` and print it.

    The platform is hard-coded to ``'ios'``. The date text is parsed with
    the format ``'%B %d, %Y'`` and stored as a ``datetime``. The score is
    stored as the extracted text, without numeric conversion. The game
    name is the ``alt`` text of the last matching app image. The conclusion
    is the plain text of all matched paragraphs joined together. The item
    is printed, not returned or yielded.

    Raises:
        IndexError: if a required XPath query matches nothing.
        ValueError: if the date text does not match the expected format.
    """
    item = ReviewrItem()
    select = response.xpath

    title = select('//title/text()').extract()[0]
    author = select(
        '//a[@class="underline aa_text--bold"]/text()').extract()[0]

    date_text = select(
        '//div[@class="aa_text--center aa_opacity--05 aa_margin-t--5"]/text()'
    ).extract()[0]
    date = datetime.strptime(date_text, '%B %d, %Y')

    score = select('//div[@class="r_c_rt_t_a"]/text()').extract()[0]

    image_alts = select(
        '//div[@class="r_c_rf-app_img aa_position--relative"]/img/@alt'
    ).extract()
    game_name = image_alts[-1]

    url = response.url
    platform = 'ios'

    # Reduce the matched paragraphs' HTML to plain text.
    paragraphs = select('//section[@id="top"]/div[5]/div[2]/div/p').extract()
    paragraph_bytes = ''.join(paragraphs).encode('utf-8')
    conclusion = BeautifulSoup(
        paragraph_bytes, 'html.parser').get_text().strip()

    item['title'] = title
    item['date'] = date
    item['game'] = game_name
    item['platform'] = platform
    item['author'] = author
    item['url'] = url
    item['score_orig'] = score
    item['score_critic'] = score
    item['conclusion'] = conclusion

    print(item)