def _getGutenbergSpiegelChapters(pages):
    """Parse Gutenberg-Spiegel HTML pages into book chapters.

    One chapter is built per page from the page's ``<div id="gutenb">``
    element: the first ``<h3>`` inside it supplies the title and every
    ``<p>`` inside it supplies one paragraph. Paragraph text is split into
    sentences on ``.``, ``!`` and ``?``; blank sentences and paragraphs
    without any sentence are dropped.

    Args:
        pages: Iterable of HTML documents, one per page.

    Returns:
        A list of ``Library.BookChapter`` objects in page order. A page is
        skipped when it has no ``gutenb`` container, when it yields no
        non-empty paragraph, or when the chapter title is ``None``.
    """
    chapters = []
    for page in pages:
        soup = BeautifulSoup(page)

        # Look the container up once. A page without it carries no chapter
        # content, so skip it instead of failing on ``None.find``.
        container = soup.find('div', attrs={'id': 'gutenb'})
        if container is None:
            continue

        chapter = Library.BookChapter()
        # Start from a fresh list so chapters never share paragraph storage.
        # NOTE(review): assumes a plain list is the expected type of
        # BookChapter.Paragraphs - confirm against Library.
        chapter.Paragraphs = []

        titleElement = container.find('h3')
        if titleElement is not None:
            chapter.Title = ''.join(titleElement.strings)

        for paragraphElement in container.find_all('p'):
            text = ''.join(paragraphElement.strings)
            # Treat '!' and '?' like '.' so one split finds every boundary.
            fragments = text.replace('!', '.').replace('?', '.').split('.')
            sentences = [s for s in (f.strip() for f in fragments) if s]
            if sentences:
                paragraph = Library.BookParagraph()
                paragraph.Sentences = sentences
                chapter.Paragraphs.append(paragraph)

        if chapter.Paragraphs and chapter.Title is not None:
            chapters.append(chapter)
    return chapters
def _getLowereadBookChapters(pages):
    """Parse Loweread HTML pages into book chapters.

    On each page the text is read from the ``<p class="MsoNormal">`` nested
    in a ``<div class="MsoNormal">`` inside ``<td class="tb_read_book">``.
    Within that element, a ``<div>`` whose ``.string`` is non-blank starts a
    new chapter (that string becomes the title), and every ``<p>`` with
    non-blank text adds a paragraph to the chapter currently being
    collected. The current chapter is carried across pages, so a chapter
    may span several of them.

    Paragraph text is split into sentences on ``.``, ``!`` and ``?``; blank
    sentences and paragraphs without any sentence are dropped.

    Args:
        pages: Iterable of HTML documents in reading order.

    Returns:
        A list of ``Library.BookChapter`` objects in document order. Only
        chapters holding at least one paragraph are included. Text found
        before the first title is collected under an empty title.
    """
    chapters = []
    currentChapter = Library.BookChapter()
    currentChapter.Title = ''
    currentChapter.Paragraphs = []

    for page in pages:
        soup = BeautifulSoup(page)

        # Any missing level of the expected nesting means the page has no
        # readable content; skip it rather than fail on ``None.find``.
        readBookElement = soup.find('td', attrs={'class': 'tb_read_book'})
        if readBookElement is None:
            continue
        wrapperElement = readBookElement.find(
            'div', attrs={'class': 'MsoNormal'})
        if wrapperElement is None:
            continue
        pageContentElement = wrapperElement.find(
            'p', attrs={'class': 'MsoNormal'})
        if pageContentElement is None:
            continue

        # An exact name list avoids a regex that also matched tags merely
        # containing "p" (which were ignored below anyway).
        for child in pageContentElement.find_all(['p', 'div']):
            if child.name == 'div':
                title = child.string
                if title is not None and title.strip() != '':
                    # Keep the finished chapter only if it gathered text.
                    if len(currentChapter.Paragraphs) != 0:
                        chapters.append(currentChapter)
                    currentChapter = Library.BookChapter()
                    currentChapter.Title = title
                    # Same explicit initialisation as the first chapter so
                    # chapters never share paragraph storage.
                    currentChapter.Paragraphs = []
            elif child.name == 'p':
                text = ''.join(child.strings)
                if text.strip() == '':
                    continue
                # Line endings come from the page, not from the host OS,
                # so normalise both conventions explicitly.
                text = text.replace('\r\n', ' ').replace('\n', ' ')
                fragments = re.split('[.!?]', text)
                sentences = [s for s in (f.strip() for f in fragments) if s]
                if sentences:
                    paragraph = Library.BookParagraph()
                    paragraph.Sentences = sentences
                    currentChapter.Paragraphs.append(paragraph)

    # Flush the chapter still being collected; without this the last
    # chapter of the book would be lost.
    if len(currentChapter.Paragraphs) != 0:
        chapters.append(currentChapter)
    return chapters
def _getReadcentralBookChapter(path):
    """Download one chapter page from www.readcentral.com and parse it.

    The title is the stripped ``.string`` of the first ``<div>`` inside
    ``<div id="pagehead">``. Every ``<p>`` inside
    ``<div id="ctl00_contents_book_chapter_content_area">`` is turned into a
    paragraph whose sentences come from
    ``getEnSentencesFromParagraphString``; paragraphs without sentences are
    left out.

    Args:
        path: Path of the chapter page, passed to
            ``HtmlDownloader.DownloadHtml`` together with the host name.

    Returns:
        A ``Library.BookChapter`` with ``Title`` and ``Paragraphs`` set.

    Note:
        Missing page elements are not checked for; a page lacking the
        expected structure fails with an ``AttributeError``.
    """
    result = Library.BookChapter()
    result.Paragraphs = []

    markup = HtmlDownloader.DownloadHtml('www.readcentral.com', path)
    document = BeautifulSoup(markup)

    header = document.find('div', attrs={'id': 'pagehead'})
    result.Title = header.div.string.strip()

    contentArea = document.find(
        'div', attrs={'id': 'ctl00_contents_book_chapter_content_area'})
    for node in contentArea.find_all('p'):
        # ''.join always yields a str, so the text needs no None check.
        text = ''.join(node.strings)
        entry = Library.BookParagraph()
        entry.Sentences = getEnSentencesFromParagraphString(text)
        if len(entry.Sentences) > 0:
            result.Paragraphs.append(entry)
    return result