def extractChapterUrlsAndMetadata(self):
    """Parse the story's index page: title, cover, author, last-update
    date, description, and the chapter list."""
    logger.debug('URL: %s', self.url)
    data = self.get_request(self.url)
    soup = self.make_soup(data)

    info = soup.select_one('.inform-inform-data')
    # Page titles look like "Story Title | extra" -- keep only the title.
    self.story.setMetadata('title', stripHTML(info.h3).split(' | ')[0])
    self.setCoverImage(self.url, soup.select_one('.inform-product > img')['src'])

    # The author line reads "Auteur : Name ... Babelcheck".
    # (Unicode: the site's ':' is the fullwidth colon, not ASCII ':'.)
    # BUGFIX: also strip the real non-breaking space (\xa0).  The old code
    # only removed '\xc2\xa0' -- the mojibake form seen when UTF-8 bytes are
    # double-decoded -- so on correctly decoded pages the NBSP leaked into
    # the author metadata.
    author = stripHTML(info.h6).split('Babelcheck')[0].replace(
        'Auteur : ', '').replace('\xc2\xa0', '').replace('\xa0', '')
    self.story.setMetadata('author', author)
    # Site doesn't have authorUrl links, so reuse the name as the id.
    self.story.setMetadata('authorId', author)

    # Newest-chapter href embeds the date as .../YYYY/MM/DD/... -- slice it.
    datestr = stripHTML(
        soup.select_one('.newestchapitre > div > a')['href'])[-11:-1]
    date = makeDate(datestr, '%Y/%m/%d')
    if date:
        self.story.setMetadata('dateUpdated', date)

    intro = stripHTML(info.select_one('.inform-inform-txt').span)
    self.setDescription(self.url, intro)

    for content in soup.findAll('div', {'id': 'content'}):
        for a in content.findAll('a'):
            self.add_chapter(a.get_text(), a['href'])
def extractChapterUrlsAndMetadata(self):
    """Parse the story's index page: title, cover, author, last-update
    date, description, and the (possibly volume-grouped) chapter list.

    Raises StoryDoesNotExist on a 404; re-raises other HTTP errors.
    """
    logger.debug('URL: %s', self.url)
    try:
        data = self._fetchUrl(self.url)
    except HTTPError as exception:
        if exception.code == 404:
            raise exceptions.StoryDoesNotExist('404 error: {}'.format(
                self.url))
        raise exception
    soup = self.make_soup(data)

    info = soup.select_one('#info')
    self.story.setMetadata('title', stripHTML(info.h1))
    self.setCoverImage(self.url, soup.select_one('#fmimg > img')['src'])

    info_paragraphs = info('p')
    # Unicode strings because ':' isn't ':', but \xef\xbc\x9a
    author = stripHTML(info_paragraphs[0]).replace(u'Author:', '', 1)
    self.story.setMetadata('author', author)
    # Site doesn't have authorUrl links, so reuse the name as the id.
    self.story.setMetadata('authorId', author)

    datestr = stripHTML(info_paragraphs[2]).replace(u'UpdateTime:', '', 1)
    date = None
    try:
        date = makeDate(datestr, self.NEW_DATE_FORMAT)
    except ValueError:
        # Some older stories use a different date format.
        date = makeDate(datestr, self.OLD_DATE_FORMAT)
    if date:
        self.story.setMetadata('dateUpdated', date)

    intro = soup.select_one('#intro')
    # Strip the leading <strong>Description</strong> header if present.
    if intro.strong:
        intro.strong.decompose()
    self.setDescription(self.url, intro)

    dl = soup.select_one('#list > dl')
    # BUGFIX: initialize volume so a <dd> appearing before the first <dt>
    # doesn't raise NameError.
    volume = ''
    for el in dl.contents:
        if el.name == u'dt':
            # <dt> headers look like 《Title》 Volume-name; "Text" means
            # no real volume grouping.
            match = re.match(ensure_text(r'^《.+》\s+(.+)$'),
                             stripHTML(el), re.UNICODE)
            volume = ''
            if match and match.group(1) != 'Text':
                volume = match.group(1) + ' '
        elif el.name == u'dd':
            a = el.a
            # BUGFIX: a['style'] raised KeyError when the chapter link had
            # no style attribute; .get() keeps the same comparison for
            # styled links and treats missing style as "not gray".
            if a.get('style') != 'color:Gray;':
                # skip grayed out "In preparation" chapters
                url = urlparse.urljoin(self.url, a['href'])
                title = volume + stripHTML(a)
                self.add_chapter(title, url)
def __init__(self, savedmeta):
    """Rehydrate a metadata holder from a saved record.

    Copies url and count, then sets every saved meta key as an
    attribute; 'description' is flattened to plain text first.
    """
    self.url = savedmeta.url
    self.count = savedmeta.count
    # BUGFIX: dict.iteritems() is Python 2 only and raises AttributeError
    # on Python 3; items() behaves the same on both.
    for k, v in savedmeta.meta.items():
        if k == 'description':
            v = stripHTML(v)
        setattr(self, k, v)
def __init__(self, savedmeta):
    """Rehydrate a metadata holder from a saved record.

    Copies url and count, then sets every saved meta key as an
    attribute; 'description' is flattened to plain text first.
    """
    self.url = savedmeta.url
    self.count = savedmeta.count
    # BUGFIX: dict.iteritems() is Python 2 only and raises AttributeError
    # on Python 3; items() behaves the same on both.
    for k, v in savedmeta.meta.items():
        if k == 'description':
            v = stripHTML(v)
        setattr(self, k, v)
def extractChapterUrlsAndMetadata(self):
    """Parse the story's index page: title, cover, author, last-update
    date, description, and the (possibly volume-grouped) chapter list.

    Raises StoryDoesNotExist on a 404; re-raises other HTTP errors.
    """
    logger.debug('URL: %s', self.url)
    try:
        data = self._fetchUrl(self.url)
    except HTTPError as exception:
        if exception.code == 404:
            raise exceptions.StoryDoesNotExist('404 error: {}'.format(self.url))
        raise exception
    soup = self.make_soup(data)

    info = soup.select_one('#info')
    self.story.setMetadata('title', stripHTML(info.h1))
    self.setCoverImage(self.url, soup.select_one('#fmimg > img')['src'])

    info_paragraphs = info('p')
    # Unicode strings because ':' isn't ':', but \xef\xbc\x9a
    author = stripHTML(info_paragraphs[0]).replace(u'Author:', '', 1)
    self.story.setMetadata('author', author)
    # Site doesn't have authorUrl links, so reuse the name as the id.
    self.story.setMetadata('authorId', author)

    datestr = stripHTML(info_paragraphs[2]).replace(u'UpdateTime:', '', 1)
    date = None
    try:
        date = makeDate(datestr, self.NEW_DATE_FORMAT)
    except ValueError:
        # Some older stories use a different date format.
        date = makeDate(datestr, self.OLD_DATE_FORMAT)
    if date:
        self.story.setMetadata('dateUpdated', date)

    intro = soup.select_one('#intro')
    # Strip the leading <strong>Description</strong> header if present.
    if intro.strong:
        intro.strong.decompose()
    self.setDescription(self.url, intro)

    dl = soup.select_one('#list > dl')
    # BUGFIX: initialize volume so a <dd> appearing before the first <dt>
    # doesn't raise NameError.
    volume = ''
    for el in dl.contents:
        if el.name == u'dt':
            # <dt> headers look like 《Title》 Volume-name; "Text" means
            # no real volume grouping.
            match = re.match(ensure_text(r'^《.+》\s+(.+)$'),
                             stripHTML(el), re.UNICODE)
            volume = ''
            if match and match.group(1) != 'Text':
                volume = match.group(1) + ' '
        elif el.name == u'dd':
            a = el.a
            url = urlparse.urljoin(self.url, a['href'])
            title = volume + stripHTML(a)
            self.add_chapter(title, url)
def extractChapterUrlsAndMetadata(self):
    """Parse the story's index page: title, cover, author, last-update
    date, description, and the chapter list.

    Raises StoryDoesNotExist on a 404; re-raises other HTTP errors.
    """
    logger.debug('URL: %s', self.url)
    try:
        data = self._fetchUrl(self.url)
    except HTTPError as exception:
        if exception.code == 404:
            raise exceptions.StoryDoesNotExist('404 error: {}'.format(self.url))
        raise exception
    soup = self.make_soup(data)

    info = soup.select_one('#info')
    self.story.setMetadata('title', stripHTML(info.h1))
    self.setCoverImage(self.url, soup.select_one('#fmimg > img')['src'])

    info_paragraphs = info('p')
    # Unicode strings because ':' isn't ':', but \xef\xbc\x9a
    author = stripHTML(info_paragraphs[0]).replace(u'Author:', '', 1)
    self.story.setMetadata('author', author)
    # Site doesn't have authorUrl links, so reuse the name as the id.
    self.story.setMetadata('authorId', author)

    datestr = stripHTML(info_paragraphs[2]).replace(u'UpdateTime:', '', 1)
    date = None
    try:
        date = makeDate(datestr, self.NEW_DATE_FORMAT)
    except ValueError:
        # Some older stories use a different date format.
        date = makeDate(datestr, self.OLD_DATE_FORMAT)
    if date:
        self.story.setMetadata('dateUpdated', date)

    intro = soup.select_one('#intro')
    # Strip the leading <strong>Description</strong> header.
    # BUGFIX: guard against pages with no <strong> -- intro.strong is None
    # there and .decompose() raised AttributeError (the sibling adapters
    # already guard this way).
    if intro.strong:
        intro.strong.decompose()
    self.setDescription(self.url, intro)

    for a in soup.select('#list a'):
        url = urlparse.urljoin(self.url, a['href'])
        title = stripHTML(a)
        self.add_chapter(title, url)
def extractChapterUrlsAndMetadata(self):
    """Parse the story's index page: cover, title, author, status,
    last-update date, description, and the chapter list (optionally
    sorted and deduplicated when dedup_order_chapter_list is set).

    Raises StoryDoesNotExist on a 404; re-raises other HTTP errors.
    """
    logger.debug('URL: %s', self.url)
    try:
        data = self._fetchUrl(self.url)
    except HTTPError as exception:
        if exception.code == 404:
            raise exceptions.StoryDoesNotExist('404 error: {}'.format(self.url))
        raise exception
    soup = self.make_soup(data)

    self.setCoverImage(self.url, soup.select_one('.book-img > img')['src'])

    book_info = soup.select_one('.book-info')
    author = book_info.select_one('.author > .name').get_text()
    self.story.setMetadata('title', book_info.select_one('.book-name').get_text())
    self.story.setMetadata('author', author)
    # Site doesn't have authorUrl links, so reuse the name as the id.
    self.story.setMetadata('authorId', author)

    ## getting status
    status = stripHTML(soup.select_one('div.book-state > span.txt'))
    if status == 'Completed':
        self.story.setMetadata('status', 'Completed')
    else:
        self.story.setMetadata('status', 'In-Progress')

    chapter_info = soup.select_one('.chapter-wrapper')
    date = makeDate(chapter_info.select_one('.update-time').get_text(),
                    self.DATE_FORMAT)
    if date:
        self.story.setMetadata('dateUpdated', date)

    intro = soup.select_one('.synopsis > .content')
    if intro.strong:
        intro.strong.decompose()
    self.setDescription(self.url, intro)

    ## skip grayed out "In preparation" chapters -- couldn't make
    ## the :not() work in the same select.
    # BUGFIX: the old filter called ch('style') -- a find_all() for
    # <style> descendant tags -- so the 'not in' test was always true and
    # the condition collapsed to "keep only chapters with no style
    # attribute at all".  Check the style attribute's value instead.
    chapters = [ch for ch in chapter_info.select('.chapter-item')
                if not (ch.has_attr('style') and 'color:Gray;' in ch['style'])]

    if self.getConfig("dedup_order_chapter_list", False):
        # Sort and deduplicate chapters (some stories in incorrect order
        # and/or duplicates)
        chapters_data = []
        # Everything except decimal point and digits.
        numbers_regex = re.compile(r'[^0-9\.]')
        for ch in chapters:
            chapter_title = ch.p.get_text()
            chapter_url = ch['href']
            try:
                # "Chapter N ..." -> N; otherwise assume the number leads.
                if chapter_title.startswith('Chapter'):
                    target_number = chapter_title.split()[1]
                else:
                    target_number = chapter_title.split()[0]
                number = float(re.sub(numbers_regex, '', target_number))
            # BUGFIX: was a bare except; only a missing (IndexError) or
            # unparseable (ValueError) number should skip a chapter.
            except (IndexError, ValueError):
                continue  # Cannot parse chapter number
            chapters_data.append((number, chapter_title, chapter_url))
        chapters_data.sort(key=lambda ch: ch[0])
        for index, chapter in enumerate(chapters_data):
            if index > 0:
                # No previous duplicate chapter names or same chapter numbers
                if chapter[1] == chapters_data[index-1][1] or \
                        chapter[0] == chapters_data[index-1][0]:
                    continue
            title = chapter[1]
            url = urlparse.urljoin(self.url, chapter[2])
            self.add_chapter(title, url)
    else:
        ## normal operation
        for ch in chapters:
            self.add_chapter(ch.p.get_text(),
                             urlparse.urljoin(self.url, ch['href']))