def parse(self, response): """ This parsing will be used to get all quotes from a single page. """ self.state['items_count'] = self.state.get('items_count', 0) + 1 quotes_list = response.xpath('/html/body/div/div[2]/div[1]/div') if not quotes_list: yield None for quote in quotes_list: item = QuotesItem() item['quote'] = quote.xpath('.//span[1]/text()')[0].extract() item['author'] = quote.xpath( './/span[2]/small/text()')[0].extract() tags = quote.xpath('.//div/a/text()').extract() item['tags'] = str(tags) author_url = quote.xpath('.//span[2]/a/@href')[0].extract() cleaned_author_url = self.clean_url(author_url) # Note the don't_filter parameter. It will allow for duplicate parsing under this call. author_description_request = scrapy.Request( cleaned_author_url, callback=self.parse_author_detail_page, meta={'item': item}, dont_filter=True, errback=self.handle_error) yield author_description_request
def parse_author(self, response):
    # Extract the author's nationality; matching on "ationality" keeps the
    # XPath insensitive to "Nationality" vs "nationality".
    author_nationality = response.xpath(
        '//div[contains(@class, "bqLn") and contains(text(), "ationality")]/a/text()'
    ).extract_first()
    quote_author_path = ('//div[contains(@id, "quotesList")]'
                         '//div[contains(@class, "bqQt")]')
    for individual_quote_author in response.xpath(quote_author_path):
        individual_quote_author_url = individual_quote_author.xpath(
            './a/img/@src').extract_first()
        # Copy the item passed along from the previous response.
        item = QuotesItem(response.meta['item'])
        item['image_urls'] = []
        # Only record an image URL when the quote actually has one.
        if individual_quote_author_url:
            individual_quote_author_url = response.urljoin(
                individual_quote_author_url)
            item['image_urls'].append(individual_quote_author_url)
        individual_quote_author_path = (
            './/span[contains(@class, "uote") and contains(@class, "ink")]'
            '/a[contains(@href, "quotes")]/text()')
        individual_quote_author_text = individual_quote_author.xpath(
            individual_quote_author_path).extract_first()
        item['author_nationality'] = author_nationality
        item['quote_text'] = individual_quote_author_text
        yield item

def parse(self, response): print("爬取页面 "+ response.url) content_list = response.xpath('//div[@class="quote"]/span[@class="text"]/text()').extract() auth_list = response.xpath('//div[@class="quote"]//small[@class="author"]/text()').extract() item = QuotesItem() for i,j in zip(auth_list,content_list): item['author'] = i item['content'] = eval(j) yield item
def parse(self, response):
    quotes = response.xpath("//div[@class='quote']")
    for quote in quotes:
        # Instantiate a new item per quote; reusing one item across
        # iterations makes every yielded result point at the same object.
        item = QuotesItem()
        item["quote"] = quote.xpath("./span[@class='text']/text()").extract()[0]
        item["author"] = quote.xpath("./span")[1].xpath(
            "./small[@class='author']/text()").extract()[0]
        yield item

def parse(self, response):
    for quote in response.css('.quote'):
        item = QuotesItem(
            quote=quote.css('.text::text').get(),
            author=quote.css('.author::text').get(),
            author_url=response.urljoin(quote.css('.author a::attr(href)').get()),
            tags=quote.css('.tag *::text').getall())
        yield item
    # Skip the request on the last page, where there is no "next" link.
    next_page = response.css('.next a::attr(href)').get()
    if next_page is not None:
        yield scrapy.Request(response.urljoin(next_page))

def parse(self, response): quotes = response.css("div.quote") for quote in quotes: loader = ItemLoader(item=QuotesItem(), selector=quote) loader.add_css("author", ".author::text") loader.add_css("quote", ".text::text") loader.add_css("tags", ".tag::text") yield loader.load_item()
def parse(self, response):
    for quote in response.css('div.quote'):
        item = QuotesItem()  # fresh item per quote, not one shared instance
        item['content'] = quote.css('span.text::text').get()
        item['author'] = quote.css('small.author::text').get()
        item['tags'] = ','.join(quote.css('div.tags a.tag::text').getall())
        yield item
    next_page_url = response.css('li.next > a::attr(href)').get()
    if next_page_url is not None:
        yield scrapy.Request(response.urljoin(next_page_url))

def parse_item(self, response):
    self.log('Scraping: ' + response.url)
    articles = response.css('.quote')
    for article in articles:
        item = QuotesItem()
        # Positional text-node indexing is fragile; it relies on the exact
        # markup of the quote block staying unchanged.
        texts = article.css('::text').extract()
        item['text'] = texts[1].strip()
        item['author'] = texts[4].strip()
        yield item

def parse_author(self, response):
    item = QuotesItem()
    description = response.xpath(
        '//div[@class="author-description"]/text()').extract_first().strip()
    item['description'] = description
    item['content'] = response.meta['content']
    item['author'] = response.meta['author']
    item['tags'] = response.meta['tags']
    item['author_url'] = response.meta['author_url']
    yield item

def parse(self, response):
    quotes = response.css('.quote')
    for quote in quotes:
        q_item = QuotesItem()
        q_item['text'] = quote.css('.text::text').extract_first()
        q_item['author'] = quote.css('.author::text').extract_first()
        q_item['tags'] = quote.css('.tags .tag::text').extract()
        yield q_item
    # "next_page" avoids shadowing the builtin next(); skip the last page.
    next_page = response.css('.pager .next a::attr("href")').extract_first()
    if next_page is not None:
        yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

def extractData(self, res):
    # Requires "import re" at module level.
    for quote in res.css('div.quote'):
        q = QuotesItem()  # one item per quote, not a single reused instance
        # Drop non-ASCII characters (e.g. the decorative curly quotes) and
        # wrap the text in plain double quotes.
        q['quote'] = '"' + re.sub(
            r'[^\x00-\x7f]', r'',
            quote.css('span.text::text').extract_first()) + '"'
        q['author'] = quote.css('small.author::text').extract_first()
        q['tags'] = ' '.join(
            str(s) for s in quote.css('div.tags > a.tag::text').extract())
        self.writeTxt(q)

def parse(self, response): quotes = response.css(".quote") for quote in quotes: item = QuotesItem() item['text'] = quote.css(".text::text").extract_first() item['author'] = quote.css(".author::text").extract_first() item['tags'] = quote.css(".tags .tag::text").extract() yield item next = response.css(".next a::attr(href)").extract_first() url = response.urljoin(next) yield scrapy.Request(url=url, callback=self.parse)
def parse_quotes(self, response):
    for i, quote in enumerate(response.css('.quote')):
        text = quote.css('.text::text').get()
        author = quote.css('.author::text').get()
        if not self.author or author == self.author:
            yield QuotesItem(text=text,
                             author=author,
                             url=response.url,
                             rank=i,
                             scrape_date=datetime.now().isoformat())
    url = response.css('.pager .next a::attr(href)').get()
    if url:
        yield response.follow(url, callback=self.parse_quotes)

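# The self.author filter above pairs with a spider argument: Scrapy's -a flag
# sets the value as an attribute on the spider instance. A hypothetical
# invocation and the class-level default it relies on (the spider name
# "quotes" is an assumption):
#
#   scrapy crawl quotes -a author="Albert Einstein"
#
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    author = None  # default: no filter; overridden by -a author=...
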
def parse(self, response):
    html_author = ('//div/h2[contains(text(), "opular") and contains(text(), "thors")]'
                   '/..//div[contains(@class, "bqLn")]')
    for individual_author in response.xpath(html_author):
        author_name = individual_author.xpath('./a/text()').extract_first()
        author_link = individual_author.xpath('./a/@href').extract_first()
        full_author_url = response.urljoin(author_link)
        # Declare an information item for the author.
        item = QuotesItem()
        item['author_name'] = author_name
        yield scrapy.Request(full_author_url,
                             callback=self.parse_author,
                             meta={'item': item})
        # Stop after the first author.
        break

def parse(self, response): for quote in response.xpath("//div[@class='quote']"): item = QuotesItem() item['quote'] = quote.xpath( "span[@class='text']/text()").extract_first() item['author'] = quote.xpath( "span/small[@class='author']/text()").extract_first() item['tags'] = quote.xpath( "div[@class='tags']/a[@class='tag']/text()").extract() yield item next_page = response.xpath( "//ul[@class='pager']/li[@class='next']/a/@href").extract_first() next_page = "http://quotes.toscrape.com" + next_page yield scrapy.http.Request(next_page, callback=self.parse)
def parse(self, response):
    for i, quote in enumerate(response.css('.quote')):
        text = quote.css('span.text::text').get()
        author = quote.css('.author::text').get()
        yield QuotesItem(text=text,
                         author=author,
                         url=response.url,
                         rank=i,
                         scrape_date=datetime.now().isoformat())
    url = response.css('.pager .next a::attr(href)').get()
    if url is not None:
        yield response.follow(url)

def parse(self, response):
    quote_list = response.xpath('//div[@class="quote"]')
    for quote in quote_list:
        item = QuotesItem()  # fresh item per quote, not one shared instance
        item['quote'] = quote.xpath('.//span[@class="text"]/text()').get()
        item['author'] = quote.xpath('.//span/small[@class="author"]/text()').get()
        item['tag'] = quote.xpath('.//div[@class="tags"]/a/text()').getall()
        if not item['tag']:
            item['tag'] = ['no_tag']
        yield item
    next_page = response.xpath('//li[@class="next"]/a/@href').get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)

def parse(self, response):
    quotes = response.css('.quote')
    for quote in quotes:
        text = quote.css('.text::text').extract_first()
        author = quote.css('.author::text').extract_first()
        tags = quote.css('.tags .tag::text').extract()
        item = QuotesItem()
        item['text'] = text
        item['author'] = author
        item['tags'] = tags
        yield item
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    if next_page is not None:
        yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

def parse(self, response):
    quote_sel = response.xpath('//div[@class="quote"]')
    for sel in quote_sel:
        quote = sel.xpath('span[@class="text"]/text()').extract_first()
        author = sel.xpath('span/small[@class="author"]/text()').extract_first()
        item = QuotesItem(quote=quote, author=author)
        yield item
    # Requires "from scrapy import Request" (or use scrapy.Request).
    next_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
    if next_url:
        yield Request(url=response.urljoin(next_url))

def parse(self, response):
    for quote in response.css('div.quote'):
        item = QuotesItem()  # fresh item per quote, not one shared instance
        item['quotes'] = quote.css(
            'span.text::text').extract_first().replace("'", "\\'")
        item['author'] = quote.css('small.author::text').extract_first()
        link = quote.css('a::attr(href)').extract_first()
        item['author_link'] = response.urljoin(link)
        item['tags'] = ';'.join(quote.css('div.tags a.tag::text').extract())
        yield item
    # self.page is assumed to be initialised to 0 on the spider; only the
    # first three pages are scraped.
    self.page += 1
    next_page = response.css('li.next a::attr(href)').extract_first()
    if next_page and self.page < 3:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

def parse_product(self, response):
    # Only proceed when the page actually contains a quotes list; extracting
    # once avoids yielding the same items repeatedly.
    if response.xpath('//div[contains(@id, "quotesList")]'):
        hrefs = response.xpath(
            '//div[contains(@class, "qti-listm")]/a/@href').extract()
        # Keep the part of each image alt text before the first comma.
        lines = [
            alt.split(",")[0]
            for alt in response.xpath(
                "//div[@class='qti-listm']//a/img/@alt").extract()
        ]
        for href, line in zip(hrefs, lines):
            new_item = QuotesItem()
            new_item['hrefs'] = href
            new_item['lines'] = line
            yield new_item

def parse(self, response):
    # Parse the listing data.
    Source = response.css(".tablelist1 td")
    message = Source.css("::text").extract()
    message2 = Source.css("::attr(onclick)").extract()
    message3 = response.css('#content .default::text').extract()
    # Drop placeholder "--" entries.
    for entry in message3.copy():
        if '--' in entry:
            message3.remove(entry)
    # Normalise the region: prefix the province when the city is Hangzhou (杭州).
    if '杭州' in message3[1]:
        message3[1] = message3[1][2:]
        message3[0] = message3[0] + '市 '
        message3.insert(0, '浙江省 ')
    message3 = ''.join(message3)
    # NOTE: time.sleep() blocks Scrapy's event loop; see the DOWNLOAD_DELAY
    # sketch below for a non-blocking alternative.
    time.sleep(0.1)
    # Disabled: collect the dates and titles
    # _title = []
    # for i in message[0::3]:
    #     _title.append(i)
    # Visit the detail (second-level) pages; the record id is sliced out of
    # each onclick attribute.
    for index, value in enumerate(message2):
        item = QuotesItem()
        # item['title'] = _title[index]
        # item['aera'] = message3
        # item['_url'] = 'http://www.zjzxts.gov.cn/wsdt/wsdtHtml/xfjxq.jsp?id=' + value[6:-2]
        url = ('http://www.zjzxts.gov.cn/wsdt/wsdtHtml/xfjxq.jsp?id='
               + value[6:-2])
        time.sleep(0.1)
        yield scrapy.Request(url,
                             meta={'item': item,
                                   'Aera': message3,
                                   'Url': url},
                             callback=self.detail_parse2)
    # Follow pagination until page 10 by splicing the page number into the link.
    current_page = response.css('#content .paginList #cp::text').extract()
    current_page = str(int(current_page[0]))
    if current_page != '10':
        one_url = response.css('#content .paginList a::attr(href)').extract_first()
        next_url = ('http://www.zjzxts.gov.cn' + one_url[:-5]
                    + current_page + '&bt=')
        time.sleep(1)
        yield scrapy.Request(url=next_url, callback=self.parse)

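# The time.sleep() calls above block Scrapy's single-threaded reactor. A
# minimal non-blocking alternative is the standard DOWNLOAD_DELAY setting;
# a sketch under that assumption (the spider class name is hypothetical,
# not given in the source):
class ComplaintsSpider(scrapy.Spider):
    name = "complaints"  # assumed name
    custom_settings = {
        'DOWNLOAD_DELAY': 1,  # throttle ~1s between requests without blocking
    }
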
def parse(self, response): for quote in response.xpath('//div[@class="quote"]'): item = QuotesItem() ''' yield { 'text': quote.xpath('span[@class="text"]/text()').extract_first(), 'author': quote.xpath('span/small[@class="author"]/text()').extract_first(), } ''' item['title'] = quote.xpath('span[@class="text"]/text()').extract_first() item['author'] = quote.xpath('span/small[@class="author"]/text()').extract_first() yield item next_page = response.xpath('/html/body/div[1]/div[2]/div[1]/nav/ul/li/a/@href').extract_first() if next_page is not None: next_page = response.urljoin(next_page) yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    # Get the divs holding the quote and author.
    quotes = response.xpath('//div[@class="quoteText"]')
    for quote in quotes:
        try:
            item = QuotesItem()
            # Strip surrounding whitespace and the decorative quote marks.
            item['quote'] = quote.xpath('text()').extract_first().strip()[1:-1]
            item['author'] = quote.xpath(
                'a/text()').extract_first().strip().split(',')[0]
            item['source'] = self.name
            yield item
        except AttributeError:
            # extract_first() returned None for a malformed entry; skip it.
            continue
    # Find the next-page link and follow it.
    next_page = response.xpath('//a[@class="next_page"]/@href').extract_first()
    if next_page is not None:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

def detail_parse2(self, response):
    # NOTE: time.sleep() blocks the event loop; DOWNLOAD_DELAY is preferred.
    time.sleep(2)
    # Receive the item passed down from the listing-page callback; replacing
    # it with a fresh QuotesItem() here would discard that data.
    item = response.meta['item']
    source = response.css('.blue1::text').extract()
    # Extract the detail-page data.
    try:
        item['url'] = response.meta['Url']
        item['date'] = source[1]
        item['aera'] = response.meta['Aera']
        item['source'] = source[2]
        item['_content'] = source[0]
        item['department'] = source[3]
        item['ans_date'] = source[4]
        item['ans_content'] = source[5]
    except IndexError:
        pass
    return item

def parse(self, response): """ Parsing function for the spider's requests. """ # Remove the line breaks on the html response = response.replace(body=response.body.replace(b"<br>", b"")) for quote in response.css(".quoteDetails"): # Create an item loader with the quote data and add it as a new quote_item self.logger.info("Creating quote_item") loader = ItemLoader(item=QuotesItem(), selector=quote) loader.add_css("quote_content", ".quoteText::text") loader.add_css("author_name", ".quoteText .authorOrTitle::text") loader.add_css("author_image", ".leftAlignedImage img::attr(src)") loader.add_css("tags", ".greyText.smallText.left a::text") quote_item = loader.load_item() yield quote_item # Scrape the next page next_page = response.css("a.next_page::attr(href)").get() if next_page is not None: yield response.follow(next_page, self.parse)
# Third-party imports
from scrapy import Spider
from scrapy.loader import ItemLoader

# Local application imports
from quotes.items import QuotesItem


class QuotesSpider(Spider):
    """ Web Scraping Spider for Goodreads website. """

    # Class attributes
    name = "quotes"
    start_urls = ["https://www.goodreads.com/quotes?page=1"]

    def parse(self, response):
        """ Parsing function for the spider's requests. """
        # Remove the line breaks in the HTML
        response = response.replace(body=response.body.replace(b"<br>", b""))
        for quote in response.css(".quoteDetails"):
            # Create an item loader with the quote data and add it as a new quote_item
            self.logger.info("Creating quote_item")
            loader = ItemLoader(item=QuotesItem(), selector=quote)
            loader.add_css("quote_content", ".quoteText::text")
            loader.add_css("author_name", ".quoteText .authorOrTitle::text")
            loader.add_css("author_image", ".leftAlignedImage img::attr(src)")
            loader.add_css("tags", ".greyText.smallText.left a::text")
            yield loader.load_item()

        # Scrape the next page
        next_page = response.css("a.next_page::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)