def parse_biquge_chapter(self, response):
    """Parse a Biquge chapter-list page.

    If the site reports a missing page, fall back to the Qidian parser
    for the same book and stop processing this response.

    :param response: chapter-list page response
    :return: yields one Request per chapter (parse_biquge_content)
    """
    if '不存在的网页' in response.body:
        item = response.meta['data']
        yield Request(url=item['url'], meta={'item': item},
                      callback=self.parse_qidian, dont_filter=True)
        # BUG FIX: without this return, the code below would still try to
        # parse chapters out of the error page (IndexError / bad items).
        return
    # Skip the first 9 <dd> entries — the "latest chapters" duplicates.
    chapters = response.xpath('//div[@id="list"]/dl/dd')[9:]
    for ordinal, chapter in enumerate(chapters, start=1):
        item = MaterialContentItem()
        item.update(response.meta['data'])
        item['title'] = chapter.xpath('./a/text()').extract()[0]
        item['ordinal'] = ordinal
        chapter_url = chapter.xpath('./a/@href').extract()[0]
        item['created_at'] = today_date()
        item['updated_at'] = today_date()
        yield Request(url=chapter_url, callback=self.parse_biquge_content,
                      meta={'item': item}, dont_filter=True)
def parse_sougou_chapter(self, response):
    """Parse a Sogou novel-site chapter-list page.

    :param response: chapter-list page response
    :return: yields one Request per chapter (parse_sougou_content)
    """
    dd_nodes = response.xpath('//div[@class="listmain"]/dl/dd')
    for ordinal, node in enumerate(dd_nodes, start=1):
        item = MaterialContentItem()
        item.update(response.meta['item'])
        # Some entries carry the title inside <b> rather than as link text.
        link_titles = node.xpath('./a/text()').extract()
        if link_titles:
            item['title'] = link_titles[0]
        else:
            item['title'] = node.xpath('.//b/text()').extract()[0]
        item['ordinal'] = ordinal
        item['created_at'] = today_date()
        item['updated_at'] = today_date()
        href = node.xpath('./a/@href').extract()[0]
        yield Request(
            url=response.urljoin(href),
            meta={'item': item},
            callback=self.parse_sougou_content,
            dont_filter=True
        )
def parse_detail(self, response):
    """Parse a jjwxc book-detail JSON response into a source item.

    Skips responses that fail to decode, carry an error ``code`` field,
    or describe a locked book.

    :param response: detail API response (JSON body)
    :return: yields a populated MaterialSourceItem
    """
    try:
        detail_json = json.loads(response.body)
    except ValueError:
        self.logger.error('[NO JSON]' + response.url)
        # BUG FIX: mode 'a' instead of 'w' — 'w' truncated the log on
        # every failure, keeping only the most recent bad response.
        with open('no_json_decoded.log', 'a') as f:
            f.write(response.body)
        return
    # The API includes a 'code' field only on error responses.
    if 'code' in detail_json:
        return
    locked = int(detail_json['islock'])
    if locked:
        self.logger.debug('[THIS BOOK IS CLOCK] ' + response.url)
        return
    item = MaterialSourceItem()
    item.update(response.meta['item'])
    item['relate_id'] = '%s_%s' % (item['source_id'], item['book_id'])
    item['url'] = 'http://www.jjwxc.net/onebook.php?novelid={}'.format(
        item['book_id'])
    item['folder_url'] = detail_json['novelCover']
    item['title'] = detail_json['novelName']
    item['author'] = detail_json['authorName']
    item['gender'] = u'女性向小说'
    # Intro arrives HTML-escaped with <br/> separators; unescape and
    # rejoin non-empty paragraphs with newlines.
    introduction = HTMLParser().unescape(detail_json['novelIntro'])
    item['introduction'] = '\n'.join(p.strip()
                                     for p in introduction.split('<br/>')
                                     if p != '')
    item['created_at'] = now_date()
    item['updated_at'] = today_date()
    yield item
def parse_detail(self, response):
    """Parse a Qidian book-detail page into the source item.

    Books without an author link are dropped silently.

    :param response: detail page response
    :return: yields the populated item
    """
    xpath_folder_url = '//div[@class="book-information cf"]/div[@class="book-img"]/a/img/@src'
    xpath_title = '//div[@class="book-information cf"]/div[@class="book-info "]/h1/em/text()'
    xpath_author = '//div[@class="book-information cf"]//a[@class="writer"]/text()'
    xpath_introduction = '//div[@class="book-intro"]/p/text()'

    item = response.meta['item']
    item['url'] = response.url
    item['relate_id'] = '%s_%s' % (item['source_id'], item['book_id'])
    cover = response.xpath(xpath_folder_url).extract()[0]
    item['folder_url'] = response.urljoin(cover).strip()
    item['title'] = response.xpath(xpath_title).extract()[0]
    authors = response.xpath(xpath_author).extract()
    if not authors:
        # No author link on the page — skip this book entirely.
        return
    item['author'] = authors[0]
    item['gender'] = u'男性向小说'
    paragraphs = response.xpath(xpath_introduction).extract()
    item['introduction'] = '\n'.join(
        p.replace(u' ', '').strip() for p in paragraphs)
    item['created_at'] = now_date()
    item['updated_at'] = today_date()
    yield item
def parse_qidian_content(self, response):
    """Extract a Qidian chapter body and yield the finished item.

    :param response: chapter page response
    :return: yields the content item with status set to 1
    """
    item = response.meta['item']
    item['chapter_url'] = response.url
    item['created_at'] = today_date()
    item['updated_at'] = today_date()
    paragraphs = response.xpath(
        '//div[@class="read-content j_readContent"]/p/text()').extract()
    item['content'] = str2binary(text_format(paragraphs))
    item['status'] = 1
    yield item
def remove_mongo_data(spider_name):
    """Purge a spider's records from MongoDB before a fresh run.

    Sources with id <= 21 only drop today's records; every other source
    has its whole collection emptied.

    :param spider_name: name of the spider whose data is cleared
    :return: None
    """
    source_id, data_coll = generate_id_coll(spider_name)
    if source_id > 21:
        mongo_db[data_coll].remove({})
    else:
        mongo_db[data_coll].remove({
            'source_id': source_id,
            'updated_at': today_date(),
        })