Example #1
0
 def parse_biquge_chapter(self, response):
     """Parse a Biquge chapter-list page and schedule every chapter download.

     When the page body contains the site's "page does not exist" marker, a
     single fallback request to the ``url`` stored in ``response.meta['data']``
     is scheduled (callback ``self.parse_qidian``) and nothing else is
     yielded.

     Otherwise one request per chapter entry is yielded. Each request carries
     a ``MaterialContentItem`` pre-filled from ``response.meta['data']`` plus
     the chapter title, a 1-based ordinal and the created/updated dates.

     :param response: response of the chapter-list page; ``meta['data']``
         holds the fields copied into every chapter item.
     :return: generator of ``Request`` objects.
     """
     if '不存在的网页' in response.body:
         item = response.meta['data']
         yield Request(url=item['url'],
                       meta={'item': item},
                       callback=self.parse_qidian,
                       dont_filter=True)
         # This is an error page: stop here instead of falling through to
         # the chapter parsing below.
         return

     # The first 9 <dd> entries are deliberately skipped.
     # NOTE(review): presumably a "latest chapters" teaser that duplicates
     # entries further down the list -- confirm against the live page.
     chapters = response.xpath('//div[@id="list"]/dl/dd')[9:]
     chapter_id = 1
     for chapter in chapters:
         titles = chapter.xpath('./a/text()').extract()
         hrefs = chapter.xpath('./a/@href').extract()
         if not titles or not hrefs:
             # A <dd> without a usable link would otherwise raise IndexError
             # and abort the generator, losing all remaining chapters.
             continue
         item = MaterialContentItem()
         item.update(response.meta['data'])
         item['title'] = titles[0]
         item['ordinal'] = chapter_id
         item['created_at'] = today_date()
         item['updated_at'] = today_date()
         chapter_id += 1
         # urljoin keeps absolute links untouched and resolves relative
         # ones, which Request would reject for lacking a scheme.
         yield Request(url=response.urljoin(hrefs[0]),
                       callback=self.parse_biquge_content,
                       meta={'item': item},
                       dont_filter=True)
Example #2
0
 def parse_sougou_chapter(self, response):
     """Walk the Sogou novel chapter list and request every chapter page.

     For each ``<dd>`` entry under ``div.listmain`` a new
     ``MaterialContentItem`` is created, seeded from ``response.meta['item']``
     and completed with the chapter title, a 1-based ordinal and the
     created/updated dates. The item travels with the chapter request in
     ``meta['item']``.

     :param response: response of the chapter-list page.
     :return: generator of ``Request`` objects handled by
         ``self.parse_sougou_content``.
     """
     entries = response.xpath('//div[@class="listmain"]/dl/dd')
     for ordinal, entry in enumerate(entries, 1):
         record = MaterialContentItem()
         record.update(response.meta['item'])
         # Prefer the link text; fall back to bold text inside the entry
         # when the link has no direct text node.
         titles = (entry.xpath('./a/text()').extract()
                   or entry.xpath('.//b/text()').extract())
         record['title'] = titles[0]
         record['ordinal'] = ordinal
         record['created_at'] = today_date()
         record['updated_at'] = today_date()
         href = entry.xpath('./a/@href').extract()[0]
         yield Request(url=response.urljoin(href),
                       meta={'item': record},
                       callback=self.parse_sougou_content,
                       dont_filter=True)
Example #3
0
 def parse_detail(self, response):
     """Build a ``MaterialSourceItem`` from a JSON book-detail response.

     Nothing is yielded when the body is not valid JSON (the raw body is
     dumped to ``no_json_decoded.log`` for inspection), when the payload
     contains a ``code`` key, or when its ``islock`` flag is non-zero.

     :param response: JSON detail response; ``meta['item']`` supplies the
         base fields, including ``source_id`` and ``book_id``.
     :return: generator yielding at most one ``MaterialSourceItem``.
     """
     try:
         detail_json = json.loads(response.body)
     except ValueError:
         self.logger.error('[NO JSON]%s', response.url)
         # Mode 'w': the dump only ever holds the most recent bad body.
         with open('no_json_decoded.log', 'w') as f:
             f.write(response.body)
         return
     # NOTE(review): a ``code`` key presumably marks an API error payload;
     # confirm against the upstream API.
     if 'code' in detail_json:
         return
     if int(detail_json['islock']):
         self.logger.debug('[THIS BOOK IS CLOCK] %s', response.url)
         return

     item = MaterialSourceItem()
     item.update(response.meta['item'])
     item['relate_id'] = '%s_%s' % (item['source_id'], item['book_id'])
     item['url'] = 'http://www.jjwxc.net/onebook.php?novelid={}'.format(
         item['book_id'])
     item['folder_url'] = detail_json['novelCover']
     item['title'] = detail_json['novelName']
     item['author'] = detail_json['authorName']
     item['gender'] = u'女性向小说'

     introduction = HTMLParser().unescape(detail_json['novelIntro'])
     # Strip first, then filter: testing emptiness before stripping let
     # whitespace-only segments through as blank lines.
     paragraphs = (p.strip() for p in introduction.split('<br/>'))
     item['introduction'] = '\n'.join(p for p in paragraphs if p)

     item['created_at'] = now_date()
     item['updated_at'] = today_date()
     yield item
Example #4
0
 def parse_detail(self, response):
     """Scrape the Qidian book detail page into the item carried in meta.

     The item from ``response.meta['item']`` is updated in place with the
     page URL, relate id, cover URL, title, author, gender label,
     introduction and the created/updated dates, then yielded. If no author
     link is found the method stops without yielding anything.

     :param response: response of the book detail page.
     :return: generator yielding at most one item.
     """
     cover_query = '//div[@class="book-information cf"]/div[@class="book-img"]/a/img/@src'
     title_query = '//div[@class="book-information cf"]/div[@class="book-info "]/h1/em/text()'
     author_query = '//div[@class="book-information cf"]//a[@class="writer"]/text()'
     intro_query = '//div[@class="book-intro"]/p/text()'

     book = response.meta['item']
     book['url'] = response.url
     book['relate_id'] = '%s_%s' % (book['source_id'], book['book_id'])

     cover_src = response.xpath(cover_query).extract()[0]
     book['folder_url'] = response.urljoin(cover_src).strip()
     book['title'] = response.xpath(title_query).extract()[0]

     # Without an author the book is dropped silently.
     authors = response.xpath(author_query).extract()
     if not authors:
         return
     book['author'] = authors[0]

     book['gender'] = u'男性向小说'
     intro_lines = [
         p.replace(u' ', '').strip()
         for p in response.xpath(intro_query).extract()
     ]
     book['introduction'] = '\n'.join(intro_lines)
     book['created_at'] = now_date()
     book['updated_at'] = today_date()
     yield book
Example #5
0
 def parse_qidian_content(self, response):
     """Store the text of a Qidian chapter page on the item carried in meta.

     The item from ``response.meta['item']`` receives the chapter URL, the
     created/updated dates, the chapter content and ``status = 1``, and is
     then yielded.

     :param response: response of the chapter page.
     :return: generator yielding the completed item.
     """
     chapter = response.meta['item']
     chapter['chapter_url'] = response.url
     chapter['created_at'] = today_date()
     chapter['updated_at'] = today_date()

     # Text nodes of every paragraph inside the reader container; they are
     # passed through the text_format and str2binary helpers before storage.
     paragraphs = response.xpath(
         '//div[@class="read-content j_readContent"]/p/text()').extract()
     chapter['content'] = str2binary(text_format(paragraphs))

     chapter['status'] = 1
     yield chapter
Example #6
0
def remove_mongo_data(spider_name):
    """Delete the MongoDB documents associated with a spider.

    The source id and collection name are resolved through
    ``generate_id_coll``. For source ids up to and including 21 only the
    documents of that source whose ``updated_at`` equals today's date are
    removed; for any other source id the whole collection is emptied.

    :param spider_name: name passed to ``generate_id_coll``.
    :return: None
    """
    source_id, data_coll = generate_id_coll(spider_name)
    # NOTE(review): the threshold 21 is a magic number; its meaning is not
    # visible here -- confirm before changing.
    criteria = (
        {'source_id': source_id, 'updated_at': today_date()}
        if source_id <= 21
        else {}
    )
    mongo_db[data_coll].remove(criteria)