コード例 #1
0
ファイル: qidian.py プロジェクト: Syhen/hmqf_crawler_hy
 def _parse_one_chapter(self, element, url):
     """Build a ``ChapterListItem`` from one chapter element.

     The element's ``<a>`` child supplies everything: its text becomes the
     title, its ``href`` (resolved against ``url``) becomes the chapter URL,
     and its ``title`` attribute carries the update time and word count.

     :param element: node whose ``./a`` child is the chapter link.
     :param url: base URL used to resolve the link's ``href``.
     :returns: the populated ``ChapterListItem``.
     :raises IndexError: if the link text, ``href`` or ``title`` is missing.
     :raises ValueError: if the word-count field is not an integer.
     """
     chapter = ChapterListItem()
     chapter['title'] = element.xpath('./a/text()')[0]

     href = element.xpath('./a/@href')[0]
     chapter['url'] = urljoin(url, href)

     # The tooltip is cut at its last space: everything before it describes
     # the update time, everything after it describes the word count.
     tooltip = element.xpath('./a/@title')[0]
     pieces = tooltip.rsplit(' ', 1)
     time_part, count_part = pieces[0], pieces[-1]

     # In both halves only the text after the last ':' is kept.
     # NOTE(review): if the time half itself contains ':' (e.g. HH:MM:SS),
     # this keeps only its trailing segment -- confirm against real data.
     chapter['updated_at'] = time_part.split(u':')[-1]
     chapter['word_count'] = int(count_part.split(u':')[-1])
     return chapter
コード例 #2
0
 def parse_chapter_list(self, content, url):
     """Yield a ``ChapterListItem`` for each chapter link in ``content``.

     Chapter links are the ``<a>`` elements matched by
     ``//ul[@class="ListRow"]/li/a``; the last two matches are dropped
     (presumably non-chapter links -- confirm against the site markup).
     Items are numbered from 1 in document order; a link that fails to
     parse is logged and skipped without consuming an ordinal.

     This is a generator, so the errors below surface when iteration
     starts, not when the method is called.

     :param content: HTML source of the chapter-list page.
     :param url: page URL, used as the base for resolving chapter hrefs.
     :raises ValueError: if ``content`` cannot be parsed or yields no
         root element.
     """
     try:
         sel = etree.HTML(content)
     except ValueError:
         raise ValueError("can't parse any volume")
     # etree.HTML() returns None when the document has no root element
     # (e.g. empty markup); report that as the same parse failure instead
     # of letting ``sel.xpath`` blow up with an AttributeError.
     if sel is None:
         raise ValueError("can't parse any volume")
     chapters = sel.xpath('//ul[@class="ListRow"]/li/a')[:-2]
     chapter_ordinal = 1
     for chapter in chapters:
         # Only item construction is guarded: keeping ``yield`` outside the
         # ``try`` means exceptions thrown into the generator by the
         # consumer are not mistaken for per-chapter parse errors.
         try:
             item = ChapterListItem()
             item['url'] = urljoin(url, chapter.xpath('./@href')[0])
             item['title'] = chapter.xpath('./text()')[0]
             item['updated_at'] = None
             item['word_count'] = 0
             item['chapter_ordinal'] = chapter_ordinal
         except Exception as e:
             self.logger.error(e)
             continue
         chapter_ordinal += 1
         yield item
コード例 #3
0
ファイル: biquge.py プロジェクト: bosslsk/hmqf_crawler_hy
 def parse_chapter_list(self, content, url):
     """Yield a ``ChapterListItem`` for each chapter entry in ``content``.

     Entries are the ``<dd>`` elements under ``//div[@id="list"]/dl``; the
     first nine are skipped (presumably a "latest chapters" block that
     duplicates later entries -- confirm against the site markup).
     Items are numbered from 1 in document order; an entry that fails to
     parse is logged and skipped without consuming an ordinal.

     This is a generator, so the errors below surface when iteration
     starts, not when the method is called.

     :param content: HTML source of the chapter-list page.
     :param url: page URL. Currently unused.
         NOTE(review): sibling parsers resolve hrefs with
         ``urljoin(url, href)``; here the raw ``href`` is stored as-is --
         confirm whether downstream code expects absolute URLs.
     :raises ValueError: if ``content`` cannot be parsed or yields no
         root element.
     """
     try:
         sel = etree.HTML(content)
     except ValueError:
         raise ValueError("can't parse any volume")
     # etree.HTML() returns None when the document has no root element
     # (e.g. empty markup); report that as the same parse failure instead
     # of letting ``sel.xpath`` blow up with an AttributeError.
     if sel is None:
         raise ValueError("can't parse any volume")
     chapters = sel.xpath('//div[@id="list"]/dl/dd')[9:]
     self.logger.debug(chapters)
     chapter_ordinal = 1
     for chapter in chapters:
         item = ChapterListItem()
         # Only item population is guarded: keeping ``yield`` outside the
         # ``try`` means exceptions thrown into the generator by the
         # consumer are not mistaken for per-chapter parse errors.
         try:
             item['title'] = chapter.xpath('./a/text()')[0]
             item['url'] = chapter.xpath('./a/@href')[0]
             item['updated_at'] = ''
             item['word_count'] = 0
             item['chapter_ordinal'] = chapter_ordinal
         except Exception as e:
             self.logger.error(e)
             continue
         chapter_ordinal += 1
         yield item
コード例 #4
0
ファイル: xxsy.py プロジェクト: bosslsk/hmqf_crawler_hy
 def parse_chapter_list(self, content, url):
     """Yield a ``ChapterListItem`` for each leading chapter in ``content``.

     Entries are the ``<li>`` elements matched by
     ``//ul[@class="catalog-list cl"]/li``. Iteration stops at the first
     entry containing an ``<i class="iconfont">`` child (presumably the
     marker for locked/paid chapters -- confirm against the site markup),
     so that entry and everything after it are not emitted.
     Items are numbered from 1 in document order; an entry that fails to
     parse is logged and skipped without consuming an ordinal.

     This is a generator, so the errors below surface when iteration
     starts, not when the method is called.

     :param content: HTML source of the chapter-list page.
     :param url: page URL, used as the base for resolving chapter hrefs.
     :raises ValueError: if ``content`` cannot be parsed or yields no
         root element.
     """
     try:
         sel = etree.HTML(content)
     except ValueError:
         raise ValueError("can't parse any volume")
     # etree.HTML() returns None when the document has no root element
     # (e.g. empty markup); report that as the same parse failure instead
     # of letting ``sel.xpath`` blow up with an AttributeError.
     if sel is None:
         raise ValueError("can't parse any volume")
     chapters = sel.xpath('//ul[@class="catalog-list cl"]/li')
     chapter_ordinal = 1
     for chapter in chapters:
         if chapter.xpath('./i[@class="iconfont"]'):
             break
         item = ChapterListItem()
         # Only item population is guarded: keeping ``yield`` outside the
         # ``try`` means exceptions thrown into the generator by the
         # consumer are not mistaken for per-chapter parse errors.
         try:
             item['title'] = chapter.xpath('./a/text()')[0]
             item['url'] = urljoin(url, chapter.xpath('./a/@href')[0])
             item['updated_at'] = ''
             item['word_count'] = 0
             item['chapter_ordinal'] = chapter_ordinal
         except Exception as e:
             self.logger.error(e)
             continue
         chapter_ordinal += 1
         yield item