def parse_reviews(self, response):
    """Parse a Zomato review JSON page and paginate through review pages.

    Yields one ScraperItem per review; while more pages remain (current
    page index is tracked in response.meta['page']) schedules the next
    review page request.
    """
    data = json.loads(response.text)
    # Bail out when the payload has no rating entities to join against.
    if ('entities' not in data) or ('RATING' not in data['entities']):
        return
    ratings = data['entities']['RATING']
    for review in data['entities']['REVIEWS'].values():
        yield ScraperItem(
            title="",
            text=review['reviewText'],
            # A review references its rating entity by id; resolve it
            # through the RATING lookup table.
            rating=ratings[str(review['rating']['entities'][0]
                               ['entity_ids'][0])]['rating'],
            source="Zomato",
        )
    if data['page_data']['sections']['SECTION_REVIEWS'][
            'numberOfPages'] > response.meta['page']:
        product_id = response.meta['id']
        next_page = response.meta['page'] + 1
        # BUG FIX: the original called review_url(id, product_id), passing
        # the *builtin* `id` and never the next page number; the sibling
        # parser shows the (product_id, page) signature.
        url = self.review_url(product_id, next_page)
        yield scrapy.Request(url,
                             callback=self.parse_reviews,
                             meta={
                                 'page': next_page,
                                 'id': product_id
                             })
def parse_page(self, response):
    """Extract url, title, body text, date and comment count from a post."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item['url'] = response.url

    titles = response.xpath(
        "//div[@class='asset-header-content-inner']/h2/a/text()").extract()
    item['title'] = titles[0] if titles else ""

    item['text'] = " ".join(response.xpath(
        "//div[@class='asset-body']/child::node()").extract())

    dates = response.xpath("//abbr[@class='datetime']/text()").extract()
    item['date'] = dates[0] if dates else ''

    counts = response.xpath(
        "//ul[@class='asset-meta-list']/li[1]/a/text()").extract()
    item["comment_count"] = counts[0] if counts else "0"

    yield item
def parse_page(self, response):
    """Extract url, title, body text, date and comment count from an entry."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item['url'] = response.url

    title_matches = response.xpath(
        "//dt[@class='entry-title']/text()").extract()
    item['title'] = title_matches[0] if title_matches else ""

    item['text'] = " ".join(response.xpath(
        "//div[@class='entry-content']/child::node()").extract())

    date_matches = response.xpath(
        "//abbr[@class='updated']/text()").extract()
    item['date'] = date_matches[0] if date_matches else ''

    count_matches = response.xpath(
        "//span[@class='comments-count']/text()").extract()
    item["comment_count"] = count_matches[0] if count_matches else "0"

    yield item
def parse_page(self, response):
    """Extract url, title, body text, date and comment count from an article."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item['url'] = response.url
    try:
        item['title'] = response.xpath(
            "//div[@id='article']/p/b/text()").extract()[0]
    except IndexError:
        item['title'] = ""
    item['text'] = " ".join(
        response.xpath("//div[@id='article']/p").extract())
    try:
        # The date cell reads like "Posted: <date>"; keep the part after
        # the first colon.
        item['date'] = response.xpath(
            "//div[@id='article']/table/tr[1]/td[1]/span[1]/table/tr[1]"
            "/td[2]/p/text()").extract()[0].split(":")[1]
    except IndexError:
        item['date'] = ""
    try:
        # The comment link reads like "Comments (12)"; pull the number out.
        # BUG FIX: ValueError is now caught too — the int() conversion can
        # fail on non-numeric captured text, which the original missed.
        item['comment_count'] = int(response.xpath(
            "//div[@id='article']/table/tr[1]/td[1]/span[1]/table/tr[1]"
            "/td[1]/a/text()").extract()[0].split("(")[1].split(")")[0])
    except (IndexError, ValueError):
        item['comment_count'] = ""
    yield item
def parse_page(self, response):
    """Extract url, date, body text, title and comment count from an entry."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item["url"] = response.url
    # ROBUSTNESS FIX: the date lookup was the only unguarded extract()[0];
    # a page without an entry footer aborted the whole parse. Guard it like
    # the sibling fields.
    try:
        item["date"] = response.xpath(
            "//p[@class='entry-footer']/text()").extract()[0]
    except IndexError:
        item["date"] = ""
    item["text"] = " ".join(
        response.xpath(
            "//div[@class='entry-body']/child::node()").extract())
    try:
        item["title"] = response.xpath(
            "//h3[@class='entry-header']/text()").extract()[0]
    except IndexError:
        item["title"] = ""
    try:
        item["comment_count"] = response.xpath(
            "//p[@class='entry-footer']/a[3]/text()").extract()[0]
    except IndexError:
        item["comment_count"] = "0"
    yield item
def extract_page(self, response):
    """Fill the configured page fields on the item carried in response.meta.

    ITEM_FIELDS_PAGE is a list of dicts with 'field', 'selector' and an
    optional 'method' describing how to post-process the extracted values.
    """
    item_content = response.meta.get('item')
    item_fields_page = self.settings.get('ITEM_FIELDS_PAGE', [])
    for item_field in item_fields_page:
        field = item_field.get('field', '')
        selector = item_field.get('selector', '')
        method = item_field.get('method', '')
        if not selector or not field:
            continue
        extract_value = response.css(selector).getall()
        if method == 'joinlink':
            extract_value = self.extract_links(response, extract_value)
        elif method == 'join':
            # BUG FIX: this branch tested 'joinlink' a second time and was
            # unreachable. NOTE(review): 'join' is the assumed method name —
            # confirm against the ITEM_FIELDS_PAGE settings.
            extract_value = ' '.join(extract_value)
        else:
            # BUG FIX: the original indexed without assigning (a no-op);
            # keep the first match, or '' when nothing matched.
            extract_value = extract_value[0] if extract_value else ''
        item_content['content'][field] = extract_value
    yield item_content
def parse_reviews(self, response):
    """Parse a review API JSON page and paginate five reviews at a time."""
    data = json.loads(response.text)
    if 'Results' not in data:
        return
    for review in data['Results']:
        yield ScraperItem(
            title=review['Title'],
            text=review['ReviewText'],
            rating=review['Rating'],
        )
    # Each page holds 5 reviews; request the next page while more remain.
    if data['TotalResults'] > response.meta['page'] * 5:
        product_id = response.meta['id']
        next_page = response.meta['page'] + 1
        # BUG FIX: the original passed the *builtin* `id` instead of the
        # product id saved in response.meta.
        url = self.review_url(product_id, next_page)
        yield scrapy.Request(url,
                             callback=self.parse_reviews,
                             meta={
                                 'page': next_page,
                                 'id': product_id
                             })
def parse_item(self, response):
    """Scrape a product page into a ScraperItem via ItemLoader.

    Collects brand, price and currency, model, available sizes, color,
    image, description, discount flag, ids and the resolved category,
    plus fixed metadata (gender, resource, url, date).
    """
    item_loader = ItemLoader(item=ScraperItem(), response=response)
    item_loader.add_xpath('brand',
                          '(//*[@itemprop="title"])[last()]/text()')
    price = response.xpath(
        '//*[@id="our_price_display"]/text()').extract_first()
    # Price text carries a 2-char currency prefix and a comma decimal
    # separator; strip the prefix and normalize before the float cast.
    item_loader.add_value(
        'price', price,
        MapCompose(lambda i: i[2:].replace(',', '.'), float))
    item_loader.add_xpath('price_currency',
                          '(//*[@itemprop="priceCurrency"])[1]/@content')
    model = response.xpath(
        '//*[@id="columns"]/div[1]/span[2]/text()').extract_first()
    item_loader.add_value('model', model)
    available_sizes = response.xpath(
        '(//*[contains(@class, "attribute_select")]/option)[.!="Select Size"]/text()'
    ).extract()
    item_loader.add_value('available_sizes', available_sizes)
    full_product_name = response.xpath(
        '//*[@class="h4"]/text()').extract_first()
    # BUG FIX: extract_first() may return None (and `model` may be None
    # too); the original then crashed on .replace(). The color is the
    # product name with the model stripped out.
    if full_product_name and model:
        item_loader.add_value('color',
                              full_product_name.replace(model, ''),
                              MapCompose(str.strip))
    image_src = response.xpath(
        '(//*[@itemprop="image"])[1]/@src').extract_first()
    item_loader.add_value('image', image_src)
    description = response.xpath(
        '//*[@id="prod-desc"]/div/p/text()').extract_first()
    item_loader.add_value('description', description)
    is_discounted = len(
        response.xpath(
            '//*[contains(@class, "price_reduced")]').extract()) != 0
    item_loader.add_value('is_discounted', is_discounted)
    inner_id = response.xpath(
        '//*[@name="id_product"]/@value').extract_first()
    item_loader.add_value('inner_id', inner_id)
    # Second-to-last breadcrumb names the site's own category; resolve it
    # to our internal category taxonomy.
    inner_category = response.xpath(
        '//*[@itemtype="http://data-vocabulary.org/Breadcrumb"][last()-1]/a[1]/@title'
    ).extract_first()
    item_loader.add_value(
        'db_category',
        self.category_resolver.resolve(inner_category, inner_id))
    # SJS has only clothing for men
    item_loader.add_value('gender', 'm')
    item_loader.add_value('resource', 'slamjamsocialism')
    item_loader.add_value('url', response.url)
    item_loader.add_value('date', str(datetime.datetime.now()))
    return item_loader.load_item()
def parse_details(self, response):
    """Scrape one event detail page into a ScraperItem."""
    item = ScraperItem()
    soup = BeautifulSoup(response.text, 'lxml')
    # The second <h2> on the page holds the event name.
    event_header = soup.find_all('h2')[1]
    item['title'] = event_header.string
    item['uri'] = response.url

    # Left column: every paragraph except the last two forms the
    # description; the second-to-last carries the date/time string.
    left_paragraphs = event_header.find_next_sibling(
        'div', {'class': 'att-detail-left-col'}).find_all('p')
    description = left_paragraphs[0].text
    for paragraph in left_paragraphs[1:len(left_paragraphs) - 2]:
        description = description + " " + paragraph.text
    item['description'] = description.replace("\xa0", " ").replace("\n", " ")

    # Date and time
    item = visitingMontgomeryHelper.parseDateTimeString(
        left_paragraphs[len(left_paragraphs) - 2].text, item)

    # Right column: the first paragraph is the street address.
    right_paragraphs = event_header.find_next_sibling(
        'div', {'class': 'att-detail-right-col'}).find_all('p')
    item = visitingMontgomeryHelper.parseAddressString(
        right_paragraphs[0].string, item)
    return item
def parse_item(self, response):
    """Scrape a 'forward' product page into a ScraperItem via ItemLoader.

    Skips sold-out products. Collects brand, price and discount state,
    sizes, color, image, gender, ids and the resolved category, plus
    fixed metadata (resource, url, date).
    """
    # Check if this item is still in stock
    if response.xpath('//*[@id="sold-out-div"]').extract_first():
        print(response.url, "\nIS SOLD OUT!")
        return
    item_loader = ItemLoader(item=ScraperItem(), response=response)
    item_loader.add_xpath('brand',
                          '(//*[@class="u-color--black"])[1]/text()',
                          MapCompose(str.strip))
    # A sale-price node present means the item is discounted; otherwise
    # fall back to the regular price node.
    price = response.xpath(
        '//*[@id="tr-pdp-price--sale"]/span[1]/text()').extract_first()
    if price:
        item_loader.add_value('is_discounted', True)
    else:
        item_loader.add_value('is_discounted', False)
        price = response.xpath(
            '//*[@id="tr-pdp-price"]/span[1]/text()').extract_first()
    # removing '$' at the beginning of price string and casting to float
    item_loader.add_value(
        'price', price,
        MapCompose(lambda i: ''.join(ch for ch in i if ch.isdigit()), float))
    # Supposedly currency of all 'forward' items will be in dollars as we
    # asked so in cookies
    item_loader.add_value('price_currency', 'USD')
    model = response.xpath(
        '//*[@class="product_name"]/text()').extract_first()
    item_loader.add_value('model', model)
    # Should we add last in stock info here as well?
    available_sizes = response.xpath(
        '(//*[@id="size-select"]/option)[.!="Select Size"]/text()').extract()
    if not available_sizes:
        available_sizes = ['One Size']
    # Drop sold-out size options and surrounding whitespace.
    item_loader.add_value(
        'available_sizes',
        filter(lambda size: not size.endswith("(Sold Out)"),
               map(str.strip, available_sizes)))
    color = response.xpath(
        '//*[@id="color-select"]/option[1]/text()').extract_first()
    if not color:
        color = response.xpath(
            '//*[contains(@class, "color_dd")]/div[1]/text()').extract_first()
    item_loader.add_value('color', color, MapCompose(str.strip))
    image_src = response.xpath(
        '//*[@class="product-detail-image"]/@src').extract_first()
    item_loader.add_value('image', image_src)
    # Gender comes from the currently-selected nav item; anything other
    # than "MENS" is treated as women's.
    gender = response.xpath(
        '//*[@class="nav-toggle__item current"]/a[1]/text()').extract_first()
    if gender == "MENS":
        item_loader.add_value('gender', 'm')
    else:
        item_loader.add_value('gender', 'w')
    inner_id = response.xpath(
        '//*[@class="product_detail"]/ul[1]/li[last()]/text()').extract_first()
    inner_id = inner_id.replace('Manufacturer Style No. ', '')
    item_loader.add_value('inner_id', inner_id)
    # The CTA button's data-category attribute is a ':'-separated
    # category path.
    inner_categories = list(
        map(str.strip,
            response.xpath('//*[@id="ctaMainBtn"]/button[1]/@data-category'
                           ).extract_first().split(':')))
    item_loader.add_value(
        'db_category',
        self.category_resolver.resolve(inner_categories, inner_id, gender))
    item_loader.add_value('resource', "forward")
    item_loader.add_value('url', response.url)
    item_loader.add_value('date', str(datetime.datetime.now()))
    return item_loader.load_item()
def bruteForceParseEvent(event):
    """Convert an event mapping (recovered via literal_eval of its repr)
    into a ScraperItem, flattening the optional nested location/address."""
    event_dict = ast.literal_eval(str(event))
    item = ScraperItem()
    item['title'] = event_dict['name']
    item['uri'] = event_dict['url']
    item['description'] = event_dict['description']

    # Re-render the UTC timestamps without seconds/offset.
    fmt_in = "%Y-%m-%dT%H:%M:%S+00:00"
    fmt_out = "%Y-%m-%dT%H:%M"
    item['starts_at'] = datetime.strftime(
        datetime.strptime(event_dict['startDate'], fmt_in), fmt_out)
    item['ends_at'] = datetime.strftime(
        datetime.strptime(event_dict['endDate'], fmt_in), fmt_out)

    if "location" in event_dict:
        location_dict = ast.literal_eval(str(event_dict['location']))
        if "name" in location_dict:
            item['location_name'] = location_dict['name']
        if "address" in location_dict:
            address_dict = ast.literal_eval(str(location_dict['address']))
            # Map the nested address keys onto the flat item fields.
            for source_key, item_field in (
                    ("streetAddress", "location_street1"),
                    ("addressLocality", "location_city"),
                    ("addressRegion", "location_state"),
                    ("postalCode", "location_zip")):
                if source_key in address_dict:
                    item[item_field] = address_dict[source_key]
    return item
def parse_actor(self, response):
    """Scrape an actor page's infobox and filmography, then follow each
    film link while the movie quota (125) is not exhausted."""
    logging.info("start scrap data from actor sites")
    info = response.xpath('//table[@class="infobox biography vcard"]')
    actorItem = ScraperItem()
    # BUG FIX: xpaths rooted at '//' on a selector search the *whole*
    # document, not the selected infobox; './/' keeps them relative.
    actorItem['actorName'] = info.xpath('.//tr/th/span/text()')[0].extract()
    actorItem['actorAge'] = info.xpath(
        './/tr/th[contains(text(),"Born")]/following::td[1]/text()'
    ).extract()
    actorItem['movielist'] = response.xpath(
        '//h2/span[contains(text(),"Filmography")]/following::ul[1]/li/i/a/text()'
    ).extract()
    self.actor_count += 1
    logging.debug("actor scraped: %d", self.actor_count)
    yield actorItem
    for sel in response.xpath(
            '//h2/span[contains(text(),"Filmography")]/following::ul[1]/li'
    ):
        # BUG FIX: extract()[0] raised IndexError on <li> entries without
        # a link, so the `is not None` guard below could never trigger;
        # extract_first() returns None instead.
        movieUrl = sel.xpath('i/a/@href').extract_first()
        if (movieUrl is not None) and (self.movie_count <= 125):
            logging.info("movie url is valid")
            yield response.follow(movieUrl, callback=self.parse_movie)
def parse_page(self, response):
    """Collect url, date, body text, title and comment count from a post."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item["url"] = response.url
    item["date"] = " ".join(response.xpath(
        "//small[@class='p-time']/child::node()/text()").extract())
    item["text"] = " ".join(response.xpath(
        "//div[@class='p-con']/child::node()").extract())

    headlines = response.xpath(
        "//div[@class='p-head']/h1/text()").extract()
    item["title"] = headlines[0] if headlines else ""

    counts = response.xpath("//div[@id='comments']/h2/text()").extract()
    # Fallback is the integer 0, matching the original behaviour.
    item["comment_count"] = counts[0] if counts else 0
    yield item
def parse_page(self, response):
    """Extract url, title, body text, date and comment count from an entry."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item['url'] = response.url

    titles = response.xpath('//h3[@class="entry-header"]/text()').extract()
    item['title'] = titles[0] if titles else ""

    item['text'] = " ".join(response.xpath(
        '//div[@class="entry-body"]/child::node()').extract())

    dates = response.xpath("//p[@class='entry-footer']/text()").extract()
    item['date'] = dates[0] if dates else ''

    footer_links = response.xpath(
        "//p[@class='entry-footer']/a/text()").extract()
    # The comment count is the third footer link, when present.
    try:
        item['comment_count'] = footer_links[2]
    except IndexError:
        item['comment_count'] = '0'
    yield item
def parse_reviews(self, response):
    """ Parse review page """
    payload = json.loads(response.text)['avaliacao']
    total = payload['quantidadeAvaliacoes']
    if total is None:
        return
    for review in payload['avaliacoes']:
        yield ScraperItem(
            title=review['titulo'],
            text=review['descricao'],
            rating=review['nota'],
        )
    # Five reviews per page: keep paginating while reviews remain.
    current_page = response.meta['page']
    if total > current_page * 5:
        product_id = response.meta['id']
        next_page = current_page + 1
        yield scrapy.Request(self.review_url(product_id, next_page),
                             callback=self.parse_reviews,
                             meta={
                                 'page': next_page,
                                 'id': product_id
                             })
def parse_page(self, response):
    """Extract url, title, body text, date and comment count from a post."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item['url'] = response.url
    try:
        item['title'] = response.xpath(
            "//div[@class='b-singlepost-wrapper']/h1/text()").extract()[0]
    except IndexError:
        item['title'] = ""
    item['text'] = " ".join(
        response.xpath("//article[2]/child::node()").extract())
    try:
        # The date is split across link parts plus a trailing text node.
        date = response.xpath("//time/a/text()").extract()
        date.append(response.xpath("//time/text()[3]").extract()[0])
        item['date'] = " ".join(date)
    except IndexError:
        item['date'] = ''
    try:
        # BUG FIX: the original stored the whole extracted *list* (which
        # also made the IndexError guard dead); take the first match like
        # the other fields do.
        item['comment_count'] = response.xpath(
            "//span[@class='js-amount'][1]/text()").extract()[0]
    except IndexError:
        item["comment_count"] = "0"
    yield item
def parse_movie(self, response):
    """Scrape a movie page's infobox and cast list, then follow each actor
    link while the actor quota (250) is not exhausted."""
    logging.info("start scrap data from movie sites")
    table = response.xpath('//table[@class="infobox vevent"]')
    movieItem = ScraperItem()
    # BUG FIX: xpaths rooted at '//' on a selector search the *whole*
    # document, not the selected infobox; './/' keeps them relative.
    movieItem['movieName'] = table.xpath('.//tr/th/text()')[0].extract()
    movieItem['movieYear'] = table.xpath(
        './/tr/th/div[contains(text(),"Release date")]/following::td/div/ul/li[1]/text()'
    ).extract_first()
    movieItem['movieGrossing'] = table.xpath(
        './/tr/th[contains(text(),"Box office")]/following::td/text()'
    ).extract_first()
    movieItem['actorlist'] = response.xpath(
        '//h2/span[contains(text(),"Cast")]/following::ul[1]/li/a/text()'
    ).extract()
    self.movie_count += 1
    logging.debug("movie scraped: %d", self.movie_count)
    yield movieItem
    for sel in response.xpath(
            '//h2/span[contains(text(),"Cast")]/following::ul[1]/li'):
        # BUG FIX: extract()[0] raised IndexError on <li> entries without
        # a link, so the `is not None` guard below could never trigger;
        # extract_first() returns None instead.
        actorUrl = sel.xpath('a/@href').extract_first()
        if (actorUrl is not None) and (self.actor_count <= 250):
            logging.info("actor url is valid")
            yield response.follow(actorUrl, callback=self.parse_actor)
def parse_page(self, response):
    """Extract url, title, body text, date and comment count from a post."""
    # use scrapy shell to find xpath
    # from scrapy.shell import inspect_response
    # inspect_response(response)
    item = ScraperItem()
    item['url'] = response.url
    try:
        item['title'] = response.xpath('//h3/text()').extract()[0]
    except IndexError:
        item['title'] = ""
    try:
        item['text'] = " ".join(
            response.xpath(
                "//div[@class='body'][1]/child::node()").extract())
    except IndexError:
        item['text'] = ''
    try:
        # BUG FIX: 'date' and 'comment_count' stored the raw extracted
        # *list* (the IndexError guard around 'date' was therefore dead);
        # take the first match, with fallbacks matching the sibling parsers.
        item['date'] = response.xpath(
            '//div[@class="index"][1]/a[1]/text()').extract()[0]
    except IndexError:
        item['date'] = ''
    try:
        item['comment_count'] = response.xpath(
            '//div[@class="index"][1]/a[2]/text()').extract()[0]
    except IndexError:
        item['comment_count'] = "0"
    yield item
def parse(self, response):
    """Store the first matching paragraph for the ScrapeTarget of this url."""
    target = ScrapeTarget.objects.get(scrape_url=response.url)
    content = response.css(".c-rte--default p.rte__paragraph").get()

    item = ScraperItem()
    item["scrape_target"] = target
    item["scrape_content"] = content
    item["scrape_time"] = str(datetime.utcnow())
    yield item
def parse_items(self, response):
    """Yield one item holding the cleaned text of every <p> on the page."""
    paragraphs = response.xpath('//p/text()').extract()
    item = ScraperItem()
    item['page'] = response.request.url
    item['content'] = CleanTag(paragraphs)
    yield item
def parse(self, response):
    """ Parse items from search page """
    complains = json.loads(
        response.text)['complainResult']['complains']['data']
    for complain in complains:
        yield ScraperItem(
            title=complain['evaluation'],
            text=complain['description'],
            rating=complain['score'],
        )
def parse_movie_page(self, response):
    """Scrape a movie page into a ScraperItem (title, genres, rating,
    stars, details, box office and technical specification)."""
    self.logger.info('Parse function called on %s', response.url)
    item = ScraperItem()
    item['title'] = response.css(
        '.title_wrapper > h1 ::text').extract_first()
    item['genres'] = ", ".join(
        response.css(
            '.see-more.inline.canwrap > a[href*=title_type] ::text').
        extract())
    item['rating'] = response.css(
        '.ratingValue > strong > span ::text').extract_first()
    item['stars'] = ", ".join(
        response.css('td:nth-child(2) > a ::text').extract())
    item['type'] = '-'  # could not locate this field on the page
    # BUG FIX: the original assignments ended with a trailing comma, which
    # wrapped each of the three dicts below in a one-element tuple.
    item['details'] = {
        'Official Sites: ':
        ", ".join(
            response.css(
                '#titleDetails >div> a[href*=offsite] ::text').extract()),
        'Country: ':
        ", ".join(
            response.css(
                '#titleDetails > div > a[href*=country_of_origin] ::text').
            extract()),
        'Language: ':
        ", ".join(
            response.css(
                '#titleDetails >div > a[href*=primary_language] ::text').
            extract()),
        'Filming Locations: ':
        response.css(
            '#titleDetails >div > a[href*=locations] ::text').extract_first()
    }
    # No stable selectors found yet for box-office figures.
    item['box_office'] = {}
    item['technical_spec'] = {
        'Runtime':
        response.css('.txt-block > time ::text').extract_first(),
        'Sound Mix: ':
        ", ".join(
            response.css(
                '.txt-block > a[href*=sound_mixes] ::text').extract()),
        'Color: ':
        response.css('.txt-block > a[href*=colors] ::text').extract_first(),
    }
    yield item
def parse_actor(self, response):
    """Scrape an actor's name and birth info from the biography infobox."""
    logging.info("start scrap data from actor sites")
    info = response.xpath('//table[@class="infobox biography vcard"]')
    actorItem = ScraperItem()
    # BUG FIX: xpaths rooted at '//' on a selector search the *whole*
    # document, not the selected infobox; './/' keeps them relative.
    actorItem['actorName'] = info.xpath('.//tr/th/span/text()')[0].extract()
    actorItem['actorAge'] = info.xpath(
        './/tr/th[contains(text(),"Born")]/following::td[1]/text()'
    ).extract()
    yield actorItem
def save_page(self, response):
    """Write the page and assets to local storage."""
    name = response.url.split("/")[-1]
    path = os.path.join('/home/vagrant/sync/scraper/files', name)
    with open(path, 'wb') as handle:
        handle.write(response.body)

    # Collect every image source and normalize it for the files pipeline.
    item = ScraperItem()
    item['file_urls'] = [
        self.process_url(src)
        for src in response.xpath('//img/@src').extract()
    ]
    yield item
def parse_job(self, response):
    """Build a job item; the name is the page title up to the first '-'."""
    job = ScraperItem()
    job['url'] = response.url
    title_texts = Selector(response).xpath('//title/text()').extract()
    job['name'] = title_texts[0].strip().split('-')[0]
    # Contact details are not available on this page.
    job["email"] = None
    job["phone"] = None
    return job
def product_page_cb(self, response: Response) -> ScraperItem:
    """Build an item from a product page: its url and first <h1> title."""
    selector = Selector(response, type="html")
    # metadata can help extracting more reliable data (json-ld, microdata, ...)
    metadata = utils.extract_metadata(url=response.url)
    self.logger.info(metadata)

    item = ScraperItem()
    item["url"] = response.url
    item["title"] = selector.xpath("//h1/text()").get()
    return item
def parse_movie(self, response):
    """Scrape a movie page's title and release year.

    Dead commented-out extractions (rating, director, writer, release
    date) were removed; reintroduce them as real fields when needed.
    """
    item = ScraperItem()
    item['title'] = response.xpath('//h1/text()').extract_first()
    item['year'] = response.xpath(
        '//*[@id="titleYear"]/a/text()').extract_first()
    yield item
def tdoll_parser(self, response):
    """Scrape a doll's name and first skill name from its wiki page."""
    doll = ScraperItem()

    # Page heading holds the doll's name.
    heading_html = response.xpath('//h1[@id="firstHeading"]').get()
    doll['name'] = BeautifulSoup(heading_html, "lxml").get_text('h1')

    # First cell of the skill data table holds the skill name.
    skill_cell_html = response.xpath(
        '//div[@class="skilldataraw"]//tr[1]//td[1]').get()
    doll['skill_name'] = str(
        BeautifulSoup(skill_cell_html, 'lxml').get_text('td'))

    print("parser for tdoll " + doll['name'] + ' skill name = '
          + doll['skill_name'])
    yield doll
def parse(self, response):
    """Yield a joke item per listing entry, then follow the pagination."""
    for joke in response.xpath("//div[@class='jokes']"):
        loader = ItemLoader(item=ScraperItem(), selector=joke)
        loader.add_xpath('joke_text', ".//div[@class='joke-text']/p")
        yield loader.load_item()

    next_href = response.xpath(
        "//li[@class='next']/a/@href").extract_first()
    if next_href is not None:
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse_movie(self, response):
    """Scrape movie name, release year and box-office gross from the infobox."""
    logging.info("start scrap data from movie sites")
    table = response.xpath('//table[@class="infobox vevent"]')
    movieItem = ScraperItem()
    # BUG FIX: xpaths rooted at '//' on a selector search the *whole*
    # document, not the selected infobox; './/' keeps them relative.
    movieItem['movieName'] = table.xpath('.//tr/th/text()')[0].extract()
    movieItem['movieYear'] = table.xpath(
        './/tr/th/div[contains(text(),"Release date")]/following::td/div/ul/li[1]/text()'
    ).extract_first()
    movieItem['movieGrossing'] = table.xpath(
        './/tr/th[contains(text(),"Box office")]/following::td/text()'
    ).extract_first()
    yield movieItem