def parse(self, response):
    item = DaomubijiItem()
    # Extract the novel title
    item['title'] = response.xpath('//h1[@class="focusbox-title"]/text()').extract_first()
    items = response.xpath('//article[@class="excerpt excerpt-c3"]')
    for href in items:
        # Extract the link to the chapter body
        detail_href = href.xpath('./a/@href').extract_first()
        yield scrapy.Request(url=detail_href, meta={'item': item}, callback=self.get_content)
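# The Request above hands each chapter URL to a get_content callback that is not
# included in this snippet. A minimal sketch of what it might look like, assuming
# the chapter body sits in .article-content paragraphs (the same selector used in
# a later snippet) and that DaomubijiItem has a 'content' field -- both assumptions:
def get_content(self, response):
    item = response.meta['item']
    # Join the chapter paragraphs into a single text block
    paragraphs = response.css('.article-content p::text').extract()
    item['content'] = '\n'.join(paragraphs)
    yield item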
def parse(self, response):
    selector = Selector(response)
    mulus = selector.xpath(
        "//div[@class='mulu']/div[@class='mulu-title']/center/h2/text()")
    boxs = selector.xpath("//div[@class='mulu']/div[@class='box']")
    for i in range(len(mulus)):
        mulu = mulus[i]  # one table-of-contents title
        box = boxs[i]    # the matching box
        texts = box.xpath(".//ul/li/a/text()").extract()  # extract the chapter texts
        urls = box.xpath(".//ul/li/a/@href").extract()    # extract the chapter links
        for j in range(len(urls)):
            item = DaomubijiItem()
            item['bookName'] = mulu.extract()
            try:
                item['bookTitle'] = texts[j].split(' ')[0]
                if len(texts[j].split(' ')) == 3:
                    item['chapterNum'] = texts[j].split(' ')[1]
                    item['chapterName'] = texts[j].split(' ')[2]
                item['chapterUrl'] = urls[j]
                request = scrapy.Request(urls[j], callback=self.parseBody)
                request.meta['item'] = item
                yield request
            except Exception as e:
                print('===', texts[j])
                print('exception', e)
                continue
def parse_item(self, response):
    # Save the raw page for inspection
    with open('first.html', 'wb') as f:
        f.write(response.body)
    hxp = Selector(response)
    urls = hxp.xpath(
        "//div[@class='excerpts-wrapper']/div[@class='excerpts']/article/a/@href"
    ).extract()
    titles = hxp.xpath(
        "//div[@class='excerpts-wrapper']/div[@class='excerpts']/article/a/text()"
    ).extract()
    print('==========================')
    for title in titles:
        print(title.split(' ')[1])
    for i in range(len(urls)):
        print('i === ' + str(i))
        arr = titles[i].split(' ')
        # Instantiate a fresh item per iteration so later yields do not
        # carry over fields from earlier ones
        item = DaomubijiItem()
        if len(arr) >= 3:
            item['url'] = urls[i]
            item['chapter'] = arr[0]
            item['chapter_num'] = arr[1]
            item['section'] = arr[2]
            item['name'] = '盗墓笔记'
        else:
            item['url'] = urls[i]
            item['chapter'] = arr[0]
            item['chapter_num'] = arr[1]
            item['name'] = '盗墓笔记'
        # time.sleep(1)
        yield item
    print('==========================')
def parse(self, response):
    article_lists = response.xpath(
        "//article[@class = 'excerpt excerpt-c3']")
    for article in article_lists:
        item = DaomubijiItem()
        chapter_name = article.xpath(".//a/text()").extract_first()
        chapter_urls = article.xpath(".//a/@href").extract_first()
        item['chapter_name'] = chapter_name
        item['chapter_urls'] = chapter_urls
        yield scrapy.Request(url=chapter_urls,
                             meta={'item': item},
                             callback=self.chapter_detail)
def parse(self, response):
    book_list = response.xpath("//table")
    for book_item in book_list:
        book_name = book_item.xpath(".//center/h2/text()").extract()[0]
        capture_list = book_item.xpath(".//tr/td/a")
        for capture_item in capture_list:
            Items = DaomubijiItem()
            capture_name = capture_item.xpath("text()").extract()[0]
            caputre_url = capture_item.xpath("@href").extract()[0]
            Items["book_name"] = book_name
            Items["capture_name"] = capture_name
            Items["caputre_url"] = caputre_url
            yield Items
def parse_chapter(self, response):
    item = DaomubijiItem()
    book_name = response.css('.content .item-3 a::text').extract_first()
    book_info = self.book_info_dict.get(book_name, '')
    chapter_name = response.css(
        '.content .article-title::text').extract_first()
    datetime = response.css('.content .item-1::text').extract_first()
    full_content = ''
    for c_p in response.css('.content .article-content p').extract():
        full_content += c_p
    item['book_name'] = book_name
    item['book_info'] = book_info
    item['chapter_name'] = chapter_name
    item['datetime'] = datetime
    item['content'] = full_content
    yield item
def parse_b(self, response):
    item = DaomubijiItem()
    book_name = response.css('.item.item-3 a::text').extract()
    piece = response.css('.article-content p::text').extract()
    content = "\n".join(piece)
    chapter_name = response.css('.article-header h1::text').extract_first()
    item['book_name'] = book_name
    item['content'] = content
    item['chapter_name'] = chapter_name
    yield item
    next_url = response.css(
        '.article-nav-next a::attr(href)').extract_first()
    if next_url:
        yield Request(url=next_url, callback=self.parse_b)
    else:
        print('end')
def parse_title(self, response):
    # Extract information from the chapter-index sub-page
    # Save raw data to the archive
    url_character = response.request.url.split("/")[-1]
    chapter_file = "files/archives/chapter_index/daomuji_chapter_index_{ct}_{dt}.html".format(
        ct=url_character, dt=get_now(tab=False))
    with open(chapter_file, "wb") as f:
        f.write(response.body)
    selector = Selector(response)
    book_order_name = selector.xpath('//h1/text()').extract()[0]
    pos = book_order_name.find(u':')
    book_order = book_order_name[:pos]     # book number
    book_name = book_order_name[pos + 1:]  # book title
    chapter_list = selector.xpath(
        '//article[@class="excerpt excerpt-c3"]//text()').extract()
    chapter_link = selector.xpath(
        '//article[@class="excerpt excerpt-c3"]/a/@href').extract()
    chapter_link_flag = 0  # index into chapter_link
    for each in chapter_list:
        pos_first = each.find(' ')
        pos_last = each.rfind(' ')
        # chapter_first = ''
        chapter_mid = ''
        # chapter_last = ''
        if pos_first != pos_last:
            chapter_first = each[:pos_first]
            chapter_mid = each[(pos_first + 1):pos_last]
            chapter_last = each[pos_last + 1:]
        else:
            chapter_first = each[:pos_first]
            chapter_last = each[pos_last + 1:]
        # Store the extracted fields
        item = DaomubijiItem()
        item['bookOrder'] = book_order
        item['bookName'] = book_name
        item['chapterFirst'] = chapter_first
        item['chapterMid'] = chapter_mid
        item['chapterLast'] = chapter_last
        yield Request(chapter_link[chapter_link_flag],
                      callback=self.parse_content,
                      meta={'item': item})
        chapter_link_flag += 1
def parse(self, response):
    sel = scrapy.selector.Selector(response)
    tables = sel.xpath('//table')
    items = []
    for each in tables:
        links = each.xpath('tr/td/a/@href').extract()      # not tbody
        contents = each.xpath('tr/td/a/text()').extract()  # not tbody
        for i in range(len(links)):  # don't forget range()
            item = DaomubijiItem()
            item['link'] = links[i]
            try:
                item['book'] = contents[i].split(' ')[0]
                item['chapter'] = contents[i].split(' ')[1]
            except Exception as e:
                continue
            try:
                item['bookname'] = contents[i].split(' ')[2]
            except Exception as e:
                item['bookname'] = contents[i].split(' ')[1][-3:]
            items.append(item)
    return items
def parse_title(self, response):
    # Extract information from the chapter-index sub-page
    selector = Selector(response)
    book_order_name = selector.xpath('//h1/text()').extract()[0]
    pos = book_order_name.find(u':')
    book_order = book_order_name[:pos]     # book number
    book_name = book_order_name[pos + 1:]  # book title
    chapter_list = selector.xpath(
        '//article[@class="excerpt excerpt-c3"]//text()').extract()
    chapter_link = selector.xpath(
        '//article[@class="excerpt excerpt-c3"]/a/@href').extract()
    chapter_link_flag = 0  # index into chapter_link
    for each in chapter_list:
        pos_first = each.find(' ')
        pos_last = each.rfind(' ')
        chapter_first = ''
        chapter_mid = ''
        chapter_last = ''
        if pos_first != pos_last:
            chapter_first = each[:pos_first]
            chapter_mid = each[(pos_first + 1):pos_last]
            chapter_last = each[pos_last + 1:]
        else:
            chapter_first = each[:pos_first]
            chapter_last = each[pos_last + 1:]
        # Store the extracted fields
        item = DaomubijiItem()
        item['bookOrder'] = book_order
        item['bookName'] = book_name
        item['chapterFirst'] = chapter_first
        item['chapterMid'] = chapter_mid
        item['chapterLast'] = chapter_last
        yield Request(chapter_link[chapter_link_flag],
                      callback=self.parse_content,
                      meta={'item': item})
        chapter_link_flag += 1
def parse(self, response):
    selector = Selector(response)
    table = selector.xpath('//table')
    items = []
    for each in table:
        content = each.xpath('tr/td/a/text()').extract()
        url = each.xpath('tr/td/a/@href').extract()
        for i in range(len(url)):
            # Instantiate a fresh item on every iteration so later data
            # does not overwrite earlier data
            item = DaomubijiItem()
            item['link'] = url[i]
            # try catches extraction errors; when one occurs, the except branch runs
            try:
                item['book'] = content[i].split(' ')[0]
                item['chapter'] = content[i].split(' ')[1]
            except Exception as e:
                continue
            try:
                item['bookname'] = content[i].split(' ')[2]
            except Exception as e:
                item['bookname'] = content[i].split(' ')[1][-3:]
            items.append(item)
    return items
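# The two table-based parsers above populate the fields link, book, chapter and
# bookname. A minimal sketch of the DaomubijiItem definition they assume (field
# names are taken from the snippets; everything else is an assumption):
import scrapy

class DaomubijiItem(scrapy.Item):
    link = scrapy.Field()      # chapter URL
    book = scrapy.Field()      # first token of the link text
    chapter = scrapy.Field()   # second token of the link text
    bookname = scrapy.Field()  # third token, falling back to the tail of the second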