def __init__(self, file_path: str, num_of_chapters: int):
    """Record the chapter count, initialise the base class with the bare
    file name, then open the epub and pre-extract its chapters."""
    self.num_of_chapters = num_of_chapters
    # Only the base name is passed up; the directory part is unused here.
    directory, base_name = os.path.split(file_path)
    super().__init__(base_name, num_of_chapters)
    opened_book = epub.open_epub(file_path)
    self.chapters = self.extract_chapters(opened_book)
def get_epub_info(filepath):
    """Return the href of every entry in the epub's OPF manifest."""
    book = epub.open_epub(filepath)
    # One href per manifest item, in manifest order.
    return [item.href for item in book.opf.manifest.values()]
def make_book_text():
    """Extract the text of every manifest item of book.epub, write it to
    Data_Input/5_Book_Text/LittleWomen_00.bd, then pack the directory with
    narctool (run under wine)."""
    book = epub.open_epub("book.epub")
    book_text = ""
    for item in book.opf.manifest.values():
        # Read each item once (the original read it twice and discarded an
        # unused BeautifulSoup of the first read).
        html = book.read_item(item)
        data = Article("", language="en")
        data.set_html(html)
        data.parse()
        book_text += data.text
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race.
    os.makedirs("Data_Input/5_Book_Text", exist_ok=True)
    with open("Data_Input/5_Book_Text/LittleWomen_00.bd", "wb") as dest_file:
        dest_file.write(book_text.encode("utf-8"))
    subprocess.call([
        "wine", "narctool.exe", "p",
        "Data_Input/5_Book_Text", "Data_Input/5_Book_Text.narc"
    ])
def __init__(self, epub_path):
    """Open the epub read-only and reset the navigation state."""
    self.epub_file = open_epub(epub_path, 'r')
    # Item currently used for navigation; None until a chapter is selected.
    self.current_item = None
    # Parsed soup of the current item's content; None until loaded.
    self.item_data_soup = None
def parse_epub(_file):
    """Parse the epub file and return a dictionary containing information
    about the magazine and articles.

    Returns {'title': str, 'articles': [{'title', 'author', 'content'}, ...]};
    the empty skeleton is returned when the file is not a valid zip.
    """
    data = {'title': '', 'articles': []}
    try:
        magazine = epub.open_epub(_file)
    except BadZipfile:
        # Not a valid epub container: report nothing rather than crash.
        return data
    # get the title of the magazine
    data['title'] = magazine.toc.title
    for item_id, _ in magazine.opf.spine.itemrefs:
        item = magazine.get_item(item_id)
        contents = magazine.read_item(item)
        # Name the parser explicitly: relying on BeautifulSoup's default makes
        # the result depend on which parsers happen to be installed.
        soup = BeautifulSoup(contents, 'html.parser')
        title = find_article_title(soup)
        author = find_article_author(soup)
        content = find_article_content(soup)
        # Only keep fully populated articles.
        if all([title, author, content]):
            data['articles'].append(
                {'title': title, 'author': author, 'content': content})
    return data
def parse_epub(_file):
    """Parse the epub file and return a dictionary containing information
    about the magazine and articles"""
    data = {'title': '', 'articles': []}
    try:
        magazine = epub.open_epub(_file)
    except BadZipfile:
        return data
    # get the title of the magazine
    data['title'] = magazine.toc.title
    for item_id, _ in magazine.opf.spine.itemrefs:
        soup = BeautifulSoup(magazine.read_item(magazine.get_item(item_id)))
        title = find_article_title(soup)
        author = find_article_author(soup)
        content = find_article_content(soup)
        # Skip articles missing any of the three fields.
        if title and author and content:
            data['articles'].append(
                {'title': title, 'author': author, 'content': content})
    return data
def epub_to_corpus(mixed, as_dict=False):
    '''
    mixed: a filepath or an epub object

    Extract the plain text of every HTML page in the book, ordered by the
    numeric suffix of the manifest key (e.g. html1, html2, ...).  Returns a
    list of text blobs, or {key: text} when as_dict is True.
    NOTE: Python 2 code (uses `basestring`).
    '''
    if isinstance(mixed, epub.EpubFile):
        book = mixed
    elif isinstance(mixed, basestring):
        book = epub.open_epub(mixed)
    else:
        raise TypeError("don't know how to handle %s of type %s" % (mixed, type(mixed)))
    # RETURNS:
    corpus = []
    valid_klist = []
    # we only want page/document content
    p_page = re.compile(r'(.*?)\.(html?)$')
    # to separate the key from its numbering
    p_key = re.compile(r'^(\D+)(\d+)$')
    #page_prefix = sorted(prefix_counter, key=prefix_counter.get, reverse=True)[0]
    prefix_counter = Counter()
    bookdata = {}
    for key in book.opf.manifest.keys():
        manifest_item = book.opf.manifest[key]
        match = p_page.match(manifest_item.href)
        if match:
            # Count key prefixes so the dominant one can be treated as the
            # page-content prefix below.
            prefix = p_key.match(key).group(1)
            prefix_counter[prefix] += 1
            bookdata[key] = book.read_item(manifest_item)
    # Most common prefix wins; assumes pages share one prefix — TODO confirm.
    page_prefix = sorted(prefix_counter, key=prefix_counter.get, reverse=True)[0]
    # strips out the leading 'html' for numbering
    keyfunc = lambda k: int(k[len(page_prefix):])
    for key in sorted(bookdata.keys(), key=keyfunc):
        xml = bookdata[key]
        # it turns out the book data may be binary data, e.g. JPEG
        tree = etree.fromstring(xml)
        ## simple xpath won't work due to namespace prefixing
        ## either use this notation
        ## tree.xpath("//xhtml:style", namespaces={'xhtml': tree.nsmap[None]})
        ## here, the None ns is actually the 'xhtml' ns
        ## though i think you can redefine it to whatever you want
        ## as opposed to 'xhtml' in the query
        ## or this
        for node in tree.xpath('''//*[local-name() = "style"]'''):
            node.getparent().remove(node)
        text = etree.tostring(tree, encoding = 'utf8', method = 'text')
        valid_klist.append(key)
        corpus.append(text)
    if as_dict:
        return dict(zip(valid_klist, corpus))
    return corpus
def read_epub_html(path):
    """Concatenate the visible text of every .html item in the epub at *path*."""
    pieces = []
    with epub.open_epub(path) as book:
        for item in book.opf.manifest.values():
            # Only page content: skip css, images, ncx, etc.
            if item.href.endswith('.html'):
                soup = BeautifulSoup(str(book.read_item(item)), 'html.parser')
                pieces.append(soup.get_text())
    return ''.join(pieces)
def get_book_fields(path):
    """Return (title, summary) for a Wikipedia-compilation epub.

    The summary is the looked-up description plus an HTML list of the
    chapter names, sorted, with any trailing '(...)' qualifiers removed.
    """
    book = epub.open_epub(path)
    top_level_points = book.toc.nav_map.nav_point[0].nav_point
    chapterNames = [point.labels[0][0] for point in top_level_points]
    title = book.toc.title.split('(')[0]
    summary = get_description(book.toc.title)
    summary += 'This is a compilation of articles from Wikipedia about %s, formatted as an ebook for easy reading. Topics include:<br/>' % title
    chapterNames = sorted(name.split('(')[0] for name in chapterNames)
    summary += '<br/>'.join(chapterNames)
    return (title, summary)
def get_linear_items_data(self, in_file_name):
    """Return the raw content of every *linear* spine item, in spine order."""
    book = epub.open_epub(in_file_name)
    book_items = []
    for item_id, linear in book.opf.spine.itemrefs:
        item = book.get_item(item_id)
        # Non-linear items (footnotes, covers, ...) are skipped.
        if not linear:
            continue
        book_items.append(book.read_item(item))
    return book_items
def __init__(self, filePath, removeFileWhenComplete=False):
    # Initialise the shared EBook state first.
    EBookBase.__init__(self, filePath, removeFileWhenComplete)
    # Underlying epub zip wrapper and the parsed Book built from it;
    # both stay None if opening fails.
    self.bookFile = None
    self.book = None
    try:
        self.bookFile = epub.open_epub(self.filePath)
        self.book = epub.Book(self.bookFile)
    except:
        # Deliberate best-effort: any failure is logged (Kodi log) and the
        # object is left in the "no book" state rather than raising.
        log("EPubEBook: Failed to process eBook %s with error: %s" % (self.filePath, traceback.format_exc()), xbmc.LOGERROR)
def convert_epub_to_html(path_epub):
    """Concatenate every HTML item of the epub into
    <top-dir>/ck12-html/<basename>.html, one item per line-separated chunk."""
    out_path = (path_epub.split('/')[0] + '/ck12-html/'
                + path_epub.split('/')[-1].split('.')[0] + '.html')
    book = epub.open_epub(path_epub)
    # Context manager: the original leaked the handle if read_item raised,
    # and shadowed the builtin name 'file'.
    with open(out_path, 'w') as out_file:
        for item in book.opf.manifest.values():
            # read the content of HTML items only
            if PATTERN_HTML.match(item.href):
                data = book.read_item(item)
                out_file.write(data + '\n')
def get_page(self, filename, page):
    """Return a JSON object with one page of sentences from the named epub,
    plus the total page count and the requested page index."""
    book = epub.open_epub(self._epub_data_directory + '/' + filename)
    data = ''
    for item_id, _ in book.opf.spine.itemrefs:
        data += book.read_item(book.get_item(item_id)).decode("utf-8")
    sentences = nltk.sent_tokenize(remove_husk(data))
    # Chunk the sentence list into fixed-size pages.
    step = self._sentences_per_page
    pages = [sentences[i:i + step] for i in range(0, len(sentences), step)]
    return json.dumps(
        {"text": ' '.join(pages[page]),
         "pages": len(pages),
         "page": page})
def get_html(URL):
    """Download every link on *URL*'s page whose href contains FILETYPE,
    rename each downloaded epub to its metadata title, and record the new
    name in the module-level `books` list."""
    for link in get_soup(URL).find_all('a'):
        file_link = link.get('href')
        if FILETYPE in file_link:
            print(file_link)
            r = requests.get(file_link, allow_redirects=True)
            # Context manager: the original `open(...).write(...)` leaked the
            # handle, so the file could still be open when reopened below.
            with open("hello.epub", 'wb') as tmp_file:
                tmp_file.write(r.content)
            with epub.open_epub("hello.epub") as book:
                title = book.opf.metadata.titles[0][0]
            # Rename only after the epub handle is closed (required on
            # Windows, where an open file cannot be renamed).
            os.rename("hello.epub", title + ".epub")
            books.append([title + ".epub"])
def extract_content(self, filepath):
    """Extract the HTML text of every XHTML item in the epub and return it
    UTF-8 encoded, one item per line."""
    book = epub.open_epub(filepath)
    sio = StringIO()
    for item in book.opf.manifest.values():
        # read the content
        # BUG FIX: ('application/xhtml+xml') is just a parenthesised string,
        # so the original `in` was a substring test; use a real tuple.
        if item.media_type in ('application/xhtml+xml',):
            data = book.read_item(item)
            #sio.write(epub.utils.get_node_text(data))
            sio.write(extract_html(data).encode('utf-8'))
            sio.write("\n")
    return to_utf8(sio.getvalue())
def parse_epub(path_to_epub: str) -> (str, str, list):
    """Return (title, author, text lines) pulled from the epub's OPF
    metadata and its spine items."""
    doc = open_epub(path_to_epub)
    title = doc.opf.as_xml_document().getElementsByTagName(
        'dc:title')[0].firstChild.nodeValue
    author = doc.opf.as_xml_document().getElementsByTagName(
        'dc:creator')[0].firstChild.nodeValue
    text_by_lines = []
    for item_id, linear in doc.opf.spine.itemrefs:
        # Each spine item is HTML; convert it to text and split into lines.
        raw = doc.read_item(doc.get_item(item_id)).decode()
        text_by_lines.extend(html2text(raw).split('\n'))
    return title, author, text_by_lines
def read_book(bookfile):
    """Return the concatenated text of every <p> element in every manifest
    item, each paragraph padded with single spaces."""
    book = epub.open_epub(bookfile)
    pieces = []
    for item in book.opf.manifest.values():
        data = book.read_item(item)
        soup = bsoup(data, 'html.parser')
        for p in soup.find_all('p'):
            # text = p.get_text().encode('utf-8')
            text = p.get_text()
            # text = text.encode('ascii', errors='ignore').decode('ascii')
            # Pad so adjacent paragraphs don't run together.
            pieces.append(' ' + text + ' ')
    # join() is linear; the original repeated `+=` is worst-case quadratic.
    return ''.join(pieces)
def process_epub(userParams):
    """ Parse manifest items using epub library """
    book = epub.open_epub(userParams['input'])
    # Function-call form of print: valid in both Python 2 and 3 for a
    # single argument (the original py2 print statement breaks py3).
    print("Parsing epub file at " + userParams['input'])
    itemHrefs = []
    for item in book.opf.manifest.values():
        itemHrefs.append(item.href)
    return itemHrefs
def get_text_from_epub_file(filename: str) -> str:
    """Return the UTF-8 text of all readable manifest items, concatenated.

    Fixes in this revision: the original overwrote `text` each iteration and
    returned only the *last* item (and raised NameError on an empty
    manifest); the book is now always closed via try/finally; items whose
    bytes are not valid UTF-8 (e.g. images) are skipped instead of being
    returned as raw bytes.
    """
    parts = []
    book = epub.open_epub(filename)
    try:
        for item in book.opf.manifest.values():
            try:
                raw = book.read_item(item)
            except KeyError:
                # Manifest entry with no backing zip member.
                continue
            try:
                parts.append(raw.decode('utf-8'))
            except UnicodeDecodeError:
                # Binary item (image, font, ...): not text, skip it.
                continue
    finally:
        book.close()
    return ''.join(parts)
def process_epub(userParams):
    """Parse manifest items using the epub library; return their hrefs."""
    book = epub.open_epub(userParams['input'])
    print("Parsing epub file at " + userParams['input'])
    # One href per manifest entry, in manifest order.
    return [item.href for item in book.opf.manifest.values()]
def book(request, id=1):
    """Django view: render book.html with the raw content of every spine
    item of the requested Book's epub file."""
    import epub
    import os
    book = Book.objects.get(pk=id)
    # os.path.join instead of hard-coded backslashes: the original only
    # produced valid paths on Windows.
    filename = os.path.join(settings._PATH, 'media', book.chapters.name)
    print(filename)
    context = {'chapters': []}
    book_file = epub.open_epub(filename)
    for item_id, linear in book_file.opf.spine.itemrefs:
        item = book_file.get_item(item_id)
        data = book_file.read_item(item)
        context['chapters'].append(data)
    return render(request, 'book.html', context)
def processEpubFile(filename):
    """Return the list of normalised words from every readable text item in
    the epub: proper nouns and punctuation removed, lower-cased."""
    epub_words = []
    book = epub.open_epub(filename)
    for item in book.opf.manifest.values():
        try:
            text = book.read_item(item)
        except KeyError:
            # Manifest entry without a backing member — skip it.
            continue
        try:
            text = text.decode('utf-8')
        except UnicodeDecodeError:
            pass
        cleaned = removePunc(removeProper(text)).lower()
        epub_words += getWords(cleaned)
    book.close()
    return epub_words
def read_epub_metadata(path):
    """Collect description/subject/publisher — and, when present, the first
    title, creator and date — from the epub's OPF metadata."""
    with epub.open_epub(path) as book:
        metadata = book.opf.metadata
        res = {
            'description': empty_guard(metadata.description),
            'subject': empty_guard(",".join(metadata.subjects)),
            'publisher': empty_guard(metadata.publisher),
        }
        # titles/creators/dates are lists of tuples; flatten and keep the
        # first element of each, if any.
        for key, entries in (('title', metadata.titles),
                             ('author', metadata.creators),
                             ('date', metadata.dates)):
            flattened = list(chain.from_iterable(entries))
            if flattened:
                res[key] = empty_guard(flattened[0])
        return res
def extract_isbn(filename):
    """Return the ISBN-10/13 found in the file name, in the first ten pages
    of a PDF (via pdftotext), or in an epub's metadata identifiers.

    Raises IsbnNotFoundError when no candidate is found.
    """
    # Raw strings for the regexes: '\d' in a plain literal is a
    # DeprecationWarning and slated to become a syntax error.
    match = re.search(r'(\d{13}|\d{10})', filename)
    if match:
        return match.group()
    data_format = determine_format(filename)
    if data_format == '.pdf':
        command = ['pdftotext', '-f', '1', '-l', '10', filename, '-']
        text = subprocess.check_output(command, stderr=subprocess.PIPE).decode(ENCODING)
        # Hyphenated ISBN anywhere in the extracted text.
        match = re.search(r'(?:[0-9]{3}-)?[0-9]{1,5}-[0-9]{1,7}-[0-9]{1,6}-[0-9]', text)
        if match:
            return match.group().replace('-', '')
    elif data_format == '.epub':
        book = epub.open_epub(filename)
        identifiers = ' '.join(book.opf.metadata.identifiers[0])
        match = re.search(r'(\d{13}|\d{10})', identifiers)
        if match:
            return match.group()
    raise IsbnNotFoundError("ISBN not found in {}".format(filename))
def get_epub_author(book):
    '''Retrieve the book's author from an Epub file.

    @param book: the book informations containing the file name, path,...
    @return: a string containing the author if found else an error message
    '''
    try:
        eb = epub.open_epub(books_path+'/'+book['file'])
        try:
            # First creator entry; .encode() yielding a usable string
            # suggests Python 2 — TODO confirm target interpreter.
            auth = eb.opf.metadata.creators[0][0].encode('utf-8')
        except:
            try:
                # Fall back to ASCII before giving up entirely.
                auth = eb.opf.metadata.creators[0][0].encode('ascii')
            except:
                auth = "error epub retrieve creator"
        eb.close()
    except:
        # Deliberate best-effort: any open failure becomes an error string
        # rather than an exception.
        auth = "error epub open " + book['file']
    return auth
def get_epub_description(book):
    '''Retrieve the book's description from an Epub file.

    @param book: the book informations containing the file name, path,...
    @return: a string containing the description if found else an error message
    '''
    try:
        eb = epub.open_epub(books_path + '/' + book['file'])
        try:
            # .encode() yielding a usable string suggests Python 2 —
            # TODO confirm target interpreter.
            desc = eb.opf.metadata.description.encode('utf-8')
        except:
            try:
                # ASCII fallback before giving up.
                desc = eb.opf.metadata.description.encode('ascii')
            except:
                desc = "error epub retrieve description"
        eb.close()
    except:
        # Deliberate best-effort: open failures become an error string.
        desc = "error epub open "+books_path + '/' + book['file']
    return desc
def main():
    """Extract the jpg/png images of every epub under ./epubs into a
    per-book folder, prefix cover images with '001_', then pack each image
    folder into a .cbz via generate_cbz.py."""
    for act_path, dirs, files in os.walk("epubs"):
        for file_ in files:
            with epub.open_epub("epubs/"+file_) as book:
                try:
                    os.mkdir(file_+"_folder")
                except OSError:
                    # NOTE(review): the folder is removed but never re-created
                    # here; extraction presumably recreates it — TODO confirm.
                    print(file_+"_folder", "already exists, overwriting...")
                    shutil.rmtree(file_+"_folder")
                print("epubs/"+file_)
                images = (name for name in book.namelist() if name.endswith("jpg") or name.endswith("png"))
                for imagename in images:
                    imagename = imagename.replace("OEBPS/", "", 1)#epub, pls
                    book.extract_item(imagename, to_path=file_+"_folder")
                    if "cover" in imagename:
                        print(imagename)
                        # '001_' prefix sorts the cover first in the cbz.
                        os.rename(file_+"_folder/OEBPS/"+imagename, file_+"_folder/OEBPS/"+imagename.replace("cover", "001_cover"))
                # Flatten the OEBPS layer the epub layout introduced.
                os.rename(file_+"_folder/OEBPS/images", file_+"_folder/images")
                os.rmdir(file_+"_folder/OEBPS")
                os.system("python generate_cbz.py \""+file_+"_folder/images/\"")
                os.rename(file_+"_folder/images/.cbz", file_+"_folder/images/"+file_+".cbz")
def open_ebook(file, file_out=""): if file_out != "": f = open(file_out, encoding=encod2, mode='w') book = epub.open_epub(file) for item_id, linear in book.opf.spine.itemrefs: item = book.get_item(item_id) # Check if linear or not if linear: print('Linear item "%s"' % item.href) if check: data = book.read_item(item) data = data.decode(encod2, errors='ignore') #print(data) if file_out != "": f.write(data) else: print('Non-linear item "%s"' % item.href) if file_out != "": f.close()
def epub_init_doc(self):
    """Prepare an epub for display: unzip it into a per-inode cache dir,
    build the spine-ordered item list, clamp the current page index, and
    create the r_epub scroll widget."""
    try:
        import epub
    except ImportError:
        # No epub support installed: tear the window down and report.
        self.Destroy()
        raise IOError('Install python-epub for EPUB files')
    else:
        import r_epub
        # Cache key is the file's inode number, so re-opening the same file
        # reuses the same extraction directory.
        extract_path = os.path.join(os.path.expanduser('~/.rbook/cache/'), str(os.stat(self.filepath).st_ino))
        zipfile.ZipFile(self.filepath, 'r').extractall(extract_path)
        self.book = epub.open_epub(self.filepath)
        # Resolve hrefs relative to the OPF's own directory.
        self.extract_path = os.path.join(extract_path, os.path.dirname(self.book.opf_path))
        chaps = self.book.opf.spine.itemrefs
        self.items = [self.book.opf.manifest[chap[0]].href for chap in chaps]
        self.n_pages = len(self.items)
        # Clamp a stale saved page index to the last page.
        if not self.current_page_idx < self.n_pages:
            self.current_page_idx = self.n_pages - 1
        # NOTE(review): chdir is process-global state — relative paths
        # elsewhere in the app are affected from here on.
        os.chdir(self.extract_path)
        self.doc_scroll = r_epub.DocScroll(self, self.current_page_idx)
def __init__(self, name):
    """Open '<name>.epub' for writing and set up the templates and default
    metadata used when adding pages."""
    # Matches <img ... src="..."/> so image sources can be rewritten later.
    self.regexImageSource = re.compile(r'''<img .*?src=['"](.*?)['"] ?.*?/>''')
    # XHTML page skeleton; filled with .format(title=..., content=...).
    self.xhtml_template = u'''<?xml version='1.0' encoding='utf-8'?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>{title}</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<!--<link href="../../stylesheet.css" type="text/css" rel="stylesheet"/>-->
</head>
<body>
<h1>{title}</h1>
{content}
</body>
</html>'''
    # Metadata defaults for the generated epub.
    self.defaultMeta = {
        'title': None,
        'language': 'en',
        'publisher': 'The MagPi LTD',
        'dates': [],
    }
    # 'w' mode: creates a brand-new epub container.
    self.book = epub.open_epub(name + '.epub', u'w')
    self.currentPlayOrder = 0
def __init__(self, name):
    """Open '<name>.epub' for writing and set up the templates and default
    metadata used when adding pages."""
    # Matches <img ... src="..."/> so image sources can be rewritten later.
    self.regexImageSource = re.compile(
        r'''<img .*?src=['"](.*?)['"] ?.*?/>''')
    # XHTML page skeleton; filled with .format(title=..., content=...).
    self.xhtml_template = u'''<?xml version='1.0' encoding='utf-8'?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>{title}</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<!--<link href="../../stylesheet.css" type="text/css" rel="stylesheet"/>-->
</head>
<body>
<h1>{title}</h1>
{content}
</body>
</html>'''
    # Metadata defaults for the generated epub.
    self.defaultMeta = {
        'title': None,
        'language': 'en',
        'publisher': 'The MagPi LTD',
        'dates': [],
    }
    # 'w' mode: creates a brand-new epub container.
    self.book = epub.open_epub(name + '.epub', u'w')
    self.currentPlayOrder = 0
# Sentiment-analysis prep: tokenise every <p> line of a fixed epub into
# sentences and count the words seen.
import epub
import re
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from SentiWordNet import SentiWordNet

WINDOW_SIZE = 5000
BOOK = "pg730"
senti = SentiWordNet.SentiWordNet()
sentiment = senti.reduce()
book = epub.open_epub('/home/yves/Documents/projects/happyEndings/'+BOOK+'.epub')
allSentences = []
w = 0  # running word count over all paragraph lines
for item in book.opf.manifest.values():
    # read the content
    data = book.read_item(item)
    for line in data.split("\n"):
        # Only lines that open a <p> tag are treated as prose.
        if re.search("^<p", line):
            # Strip all markup before tokenising.
            line = re.sub("<.*?>","", line)
            sentences = [word_tokenize(t) for t in sent_tokenize(line)]
            for sentence in sentences:
                allSentences.append(sentence)
            w += len(line.split())
scores = []
#!/usr/bin/python import epub book = epub.open_epub('rpo.epub') for item in book.opf.manifest.values(): # read the content data = book.read_item(item) # get chapter 1 print data import xml.etree.ElementTree as ET tree = ET.fromstring(data) print tree.findall('content') # item = book.get_item('chapter') # # display the same thing # print book.read_item(item)
def getEpub():
    """Open the hard-coded Waste Land epub and return the book object.

    The original opened the book and implicitly returned None; returning
    the book makes the function usable while staying backward compatible
    with callers that ignored the result.
    """
    import epub
    path = 'C:\\Users\\pdavio\\Desktop\\DigitalHumanities\\wasteland\\1321-0.epub'
    return epub.open_epub(path)
#Authors available for training authorList=[pathi.split("/")[-1] for pathi in [x[0] for x in os.walk(path)][1:]] testList=[pathi.split("/")[-1] for pathi in [x[0] for x in os.walk(pathValidation)][1:]] nauthors=len(authorList) #Loop through all the folders present in path for pathi in [x[0] for x in os.walk(path)][1:]+[x[0] for x in os.walk(pathValidation)][1:]: authorFeatures=[] #Get all epub books from the current directory books=[file for file in os.listdir(pathi) if file.endswith(".epub")] os.chdir(pathi) #Loop on all books from that directory for bookName in books: #Open the epub book as an object thanks to the epub package book = epub.open_epub(bookName) #Get all chapters "names" from the book chapterNames = [book.get_item(item_id).href for item_id, linear in book.opf.spine.itemrefs] chapterNames = [ch for ch in chapterNames if ch[:2]=='ch'] bookFeaturesi=[] #Loop on the chapter in current book for ch in chapterNames: tmp=functions.extractFeatures(book,ch,minstring,comWords) if tmp==False: continue bookFeaturesi.append(tmp) authorFeatures.append(bookFeaturesi) if path in pathi: trainFeatures.append(authorFeatures) if pathValidation in pathi: testFeatures.append(authorFeatures)
__author__ = 'mak'
# Interactive script: list the chapters of TFASbook.epub, prompt for one,
# then strip its markup and count the words.
import epub
from bs4 import BeautifulSoup

book = epub.open_epub('TFASbook.epub')
epubNavPointsList = book.toc.nav_map.nav_point
for point in epubNavPointsList:
    print('%s. %s'%(point.play_order, point.labels[0][0]))
chosenChapter = int(input("Give me the chapter: "))
curPoint = epubNavPointsList[chosenChapter - 1]
srcOfFile = curPoint.src
# this srcOfFile can contain #filepos attr at the end of string
# and we should remove it
# NOTE(review): find() returns -1 when '#' is absent, which makes the
# slice drop the last character — confirm all srcs contain '#'.
position = srcOfFile.find('#')
srcOfFile = srcOfFile[:position]
curItem = book.get_item_by_href(srcOfFile)
readedItemB = book.read_item(curItem)
readedItem = readedItemB.decode('utf-8', 'ignore')
soup = BeautifulSoup(readedItem)
# Drop script/style elements before extracting text.
for script in soup(["script", "style"]):
    script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
textWithoutDots = text.translate({ord(i):None for i in '1234567“890:"!,.?/'})
listOfAllwords = textWithoutDots.split(' ')
print('There are %s words in text.'% len(listOfAllwords))
def open_book(path):
    """Open the epub at *path*; return None on any recognised failure."""
    try:
        book = open_epub(path)
    except (BadEpubFile, BadZipFile, KeyError, IndexError):
        # Malformed container or missing expected members.
        return None
    return book
# -*- coding: utf-8 -*- __author__ = 'User' import epub import re from xml.etree import ElementTree from cStringIO import StringIO class AllEntities: def __getitem__(self, key): # key is your entity, you can do whatever you want with it here if key == "nbsp": return "" return key book = epub.open_epub('book.epub') first_file_name = book.toc.nav_map.nav_point[4].src print first_file_name chapter = book.open(first_file_name) # Open from zip string = chapter.read() print(string[:75]) first_file = StringIO(string) parser = ElementTree.XMLParser(); parser.parser.UseForeignDTD(True); # because ElementTree chokes on parser.entity = AllEntities() etree = ElementTree.ElementTree() tree = etree.parse(first_file, parser=parser)
# Index every epub in `all_data` into two local Solr cores (book + cover).
# NOTE: this chunk is truncated mid-statement at the end.
book_solr_connection = pysolr.Solr('http://localhost:8983/solr/book', timeout=10)
cover_solr_connection = pysolr.Solr('http://localhost:8983/solr/cover', timeout=10)
epub_data_directory = 'all_data'
text_data_directory = 'text'
sentences_per_page = 21
data_directory_files = os.listdir(epub_data_directory)
epub_directory_files = filter(lambda x: x.endswith('.epub'), data_directory_files)
if not os.path.exists(text_data_directory):
    os.makedirs(text_data_directory)
for epub_file in epub_directory_files:
    print(epub_file)
    book = epub.open_epub(epub_data_directory + '/' + epub_file)
    date, creator, title = '', '', ''
    if len(book.opf.metadata.dates) > 0:
        date, _ = book.opf.metadata.dates[0]
        # Fuzzy-parse anything that isn't an explicit placeholder.
        if date not in ['', 'NONE']:
            date = parse(date, fuzzy=True).year
        else:
            date = ''
    if len(book.opf.metadata.creators) > 0:
        creator, _, _ = book.opf.metadata.creators[0]
    if len(book.opf.metadata.titles) > 0:
        title, _ = book.opf.metadata.titles[0]
    description = book.opf.metadata.description
    if not description:
        description = ''
    cover_solr_connection.add([{
# CLI tool: convert an epub to JSON using extended BadgerFish conversion.
# NOTE: this chunk is truncated mid-method at the end (`if text:` has no body).
from collections import Counter, OrderedDict
from base64 import b64encode
import argparse
from zipfile import ZipFile

parser = argparse.ArgumentParser(description="Convert epub books to JSON")
parser.add_argument('epub_source', help='the epub document to convert')
parser.add_argument('destination', help='file name to write JSON to')
parser.add_argument('--binary-zip', dest='zipfile', help='instead of base64 encoding binary data, write it to this zip file instead')
args = parser.parse_args()
book = epub.open_epub(args.epub_source)

# Extend BadgerFish rules (http://badgerfish.ning.com/) to
# - add tail properties from lxml as $2
# - removes stripping of whitespace around text as this may be needed
# when displaying the book
class ExtBadgerFish(BadgerFish):
    def data(self, root):
        'Convert etree.Element into a dictionary'
        value = self.dict()
        # Only element children (skips comments/PIs, whose .tag is not a string).
        children = [node for node in root if isinstance(node.tag, basestring)]
        for attr, attrval in root.attrib.items():
            attr = attr if self.attr_prefix is None else self.attr_prefix + attr
            value[attr] = self._convert(attrval)
        if root.text and self.text_content is not None:
            text = root.text.strip()
            if text:
def setUp(self):
    # Open the fixture epub and wrap it in the high-level Book API so each
    # test gets fresh objects.
    self.epub_file = epub.open_epub(self.epub_path)
    self.book = epub.Book(self.epub_file)
def setUp(self):
    # Re-open the fixture epub before every test.
    self.epub_file = epub.open_epub(self.epub_path)
def open_book(path): try: return epub.open_epub(path) except: print 'Can`t open', path
# NOTE: this chunk begins mid-expression (a set_mode(...) call whose start is
# outside this view) and the closing ''' below ends a commented-out block
# opened earlier.
set_mode((pygame.display.Info().current_w, \
    pygame.display.Info().current_h), FULLSCREEN)
'''
##### CONSTANTS #####
TIME_INTERVAL = set_word_speed()  # Converts WPM to decimal seconds.
FILE = choose_file()  # Choose EPUB file.
CONTENTS = []  # Contents of e-book (raw). To process before displaying.
PUNCTUATION_END = '.,?!:;}])\'">~*+=-%'
PUNCTUATION_START = '([{\'"<~*=-$%#@'
ENGLISH_DICTIONARY = enchant.Dict("en_US")
##### Add raw data to CONTENTS #####
book = epub.open_epub(FILE)
for item in book.opf.manifest.values():
    CONTENTS.append(book.read_item(item))
######## Begin program #############
# Show one dictionary-recognised word at a time, paced by TIME_INTERVAL.
for sentence in CONTENTS:
    for candidate_word in sentence.split():
        try:
            word = candidate_word.decode("utf-8")
            if ENGLISH_DICTIONARY.check(word):
                time.sleep(TIME_INTERVAL)
                print(word)
            # Word wrapped in leading punctuation: check it without the
            # first character.
            elif word[0] in PUNCTUATION_START and word[-1] not in PUNCTUATION_END \
                    and ENGLISH_DICTIONARY.check(word[1:]):
                time.sleep(TIME_INTERVAL)
def get_item(path_epub):
    """Print the href of every item in the epub's OPF manifest."""
    book = epub.open_epub(path_epub)
    for item in book.opf.manifest.values():
        # Function-call form works in both Python 2 and 3 for one argument
        # (the original py2 print statement is a syntax error in py3).
        print(item.href)
# NOTE: this chunk begins inside a function whose definition is outside
# this view; the function apparently collects paragraph text into `text`.
            text.append(paragraph['#text'])
        elif 'span' in paragraph and '#text' in paragraph['span']:
            text.append(paragraph['span']['#text'])
    # some unicode symbols can be replaced by ascii-compatible characters
    # this helps with the NLP later
    text = [x.replace(u'\u00ad', '')  # word-wrap hyphens
            .replace(u'\u2019', "'")
            .replace(u'\u201c', '"')  # opening quotes
            .replace(u'\u201d', '"')  # ending quotes
            .replace(u'\u2014', '-')
            .replace(u'\u2026', '...')
            for x in text]
    return text

# Script entry: open the epub named on the command line and ensure the
# sentences table exists in the local SQLite db.
file = sys.argv[1]
book = epub.open_epub(file)
db_conn = sqlite3.connect('data.db')
c = db_conn.cursor()
c.execute('''
CREATE TABLE IF NOT EXISTS sentences (
    sentence_id INT,
    sentence TEXT,
    sentence_index INT,
    paragraph INT,
    chapter INT,
    title TEXT,
    PRIMARY KEY (sentence_id, title)
)
''')
def __init__(self, filename):
    # Python 2 API: the path must be a `unicode` object.  NOTE(review):
    # `assert` disappears under `python -O`; a TypeError would be sturdier.
    assert isinstance(filename, unicode)
    self.pub = epub.open_epub(filename)
# bk.add_item(c1); bk.toc = (epub.Link('chap_01.xhtml', 'Introduction', 'intro'),(epub.Section('Simple bk'), (c1, ))) # # add chapter; define Table Of Contents # bk.add_item(epub.EpubNcx()); bk.add_item(epub.EpubNav()) # add default NCX and Nav file # style = 'BODY {color: white;}'; nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) # define CSS style # bk.add_item(nav_css); bk.spine = ['nav', c1]; # add CSS file; basic spine # epub.write_epub('test.epub', bk, {}) # write to the file #--- import re import epub as ep from ebooklib import epub # http://epub.exirel.me/genindex.html dir1 = 'D:/scan0/lgl/' for fnm in [ 'a_20160500_20130800.epub', 'b_20130115_2008.epub', 'c_2007_2002.epub', 'd_2002_1991.epub', 'LexisNexis_201605_1991.epub' ]: # kk=0; bk = ep.open_epub(dir1 + 'LN_aside/' + fnm) o = [q for q in bk.namelist() if -1 < q.find('index')] o1 = bk.read_item(o[0]) for k in range(1, len(o)): q = bk.read_item(o[k]) o1 = o1[:o1.rfind('</body></html>' )] + q[q.find('<body class="calibre">') + 23:] x = [k.start() for k in re.finditer('\d* of \d* DOCUMENTS', o1) ] + [o1.rfind('</body></html>')] # create bk2 bk2 = epub.EpubBook() bk2.set_identifier('LN') bk2.add_author('u') bk2.set_title(fnm[:-5]) bk2.set_language('en') bk2.add_item(
def __repr__(self):
    # Short preview of the note content.  NOTE(review): `spl` is already
    # capped at 10 words, so the later [:20] slices are no-ops — confirm
    # whether 10 or 20 words were intended.
    spl = self.content.split()[:10]
    if len(self.content) > 200:
        return ' '.join(spl[:20]) + '...'
    else:
        return ' '.join(spl[:20])

# (len, begin, end) span of a highlight inside a chapter.
IndexRange = namedtuple('IndexRange', ['len', 'beg', 'end'])

# NOTE: the __main__ block below is truncated at the end of this chunk.
if __name__ == '__main__':
    filepath = sys.argv[-2]
    bookpath = sys.argv[-1]
    book = epub.open_epub(bookpath)
    # Manifest key -> raw item content for the whole book.
    mydc = dict((key, book.read_item(item)) for key, item in book.opf.manifest.items())
    # compile the list of notes
    note_list = []
    for line_orig in open(filepath).readlines():
        if line_orig.lstrip().startswith('Add a note'):
            continue
        line_type = None
        # preprocessing
        line = line_orig.replace('• Delete this highlight', '').strip()
        if not line:
            continue
        if line.startswith('Note: ') and line.endswith('Edit'):
            line_type = KindleNote.NOTE
def __repr__(self):
    # Short preview of the note content.  NOTE(review): `spl` is already
    # capped at 10 words, so the later [:20] slices are no-ops — confirm
    # whether 10 or 20 words were intended.
    spl = self.content.split()[:10]
    if len(self.content) > 200:
        return ' '.join(spl[:20]) + '...'
    else:
        return ' '.join(spl[:20])

# (len, begin, end) span of a highlight inside a chapter.
IndexRange = namedtuple('IndexRange', ['len', 'beg', 'end'])

# NOTE: the __main__ block below is truncated at the end of this chunk.
if __name__ == '__main__':
    filepath = sys.argv[-2]
    bookpath = sys.argv[-1]
    book = epub.open_epub(bookpath)
    # Manifest key -> raw item content for the whole book.
    mydc = dict(
        (key, book.read_item(item)) for key, item in book.opf.manifest.items())
    # compile the list of notes
    note_list = []
    for line_orig in open(filepath).readlines():
        if line_orig.lstrip().startswith('Add a note'):
            continue
        line_type = None
        # preprocessing
        line = line_orig.replace('• Delete this highlight', '').strip()
        if not line:
            continue
        if line.startswith('Note: ') and line.endswith('Edit'):
# from sys import argv from os import listdir from epub import open_epub filelist = listdir(".") if len(argv) > 1: for filename in filelist: if filename.endswith(".epub"): book = open_epub(filename, "r") print filename[:-5], ":", book.opf.metadata.titles[0][0] book.close() else: for filename in filelist: if filename.endswith(".epub"): book = open_epub(filename, "a") book.opf.metadata.titles = [(filename[:-5].decode('utf8'), u"")] book.close()
def epub_to_txt(
        epub_file_name,
        file_dir="epub-files",
        output_file_dir="txt-files",
        chapter_files_dir=None,
        debug=False,
        dry_run=False):
    """Convert one epub into a plain-text file plus one text file per
    chapter (skipped when dry_run).  Hidden files and non-.epub names are
    ignored silently."""
    if chapter_files_dir is None:
        chapter_files_dir = os.path.join(output_file_dir, CHAPTERS)
    _try_mkdirs(output_file_dir)
    _try_mkdirs(chapter_files_dir)
    html_to_text = html2text.HTML2Text()
    html_to_text.ignore_links = True
    # Ignore hidden files
    if epub_file_name[0] == '.':
        return
    # Ignore files that don't have the epub extension
    if os.path.splitext(epub_file_name)[1] != ".epub":
        return
    print("Opening file: %s" % epub_file_name)
    ebook = epub.open_epub(os.path.join(file_dir, epub_file_name))
    book_title = ebook.toc.title
    print("Starting on book: %s" % book_title)
    # Works with Expanse, old code
    """
    play_order = [nav_point.play_order for nav_point in ebook.toc.nav_map.nav_point]
    play_order_labels = [str(nav_point.play_order) for nav_point in ebook.toc.nav_map.nav_point]
    labels = [nav_point.labels[0][0] for nav_point in ebook.toc.nav_map.nav_point]
    source_references = [nav_point.src for nav_point in ebook.toc.nav_map.nav_point]
    """
    play_order = list()
    play_order_labels = list()
    labels = list()
    source_references = list()
    def get_all_nav_points(nav_point):
        # Depth-first flattening of the (possibly nested) NCX nav tree.
        if len(nav_point.nav_point) == 0:
            return [nav_point]
        else:
            nav_points_list = [nav_point]
            for sub_nav_point in nav_point.nav_point:
                nav_points_list += get_all_nav_points(sub_nav_point)
            return nav_points_list
    for nav_point_root in ebook.toc.nav_map.nav_point:
        for nav_point in get_all_nav_points(nav_point_root):
            play_order.append(nav_point.play_order)
            play_order_labels.append(str(nav_point.play_order))
            labels.append(nav_point.labels[0][0])
            source_references.append(nav_point.src)
    #"""
    # NOTE(review): the lists built from the nav map above are immediately
    # overwritten below by spine-derived ones — confirm whether the nav-map
    # pass is still needed.
    play_order_label_to_index = dict([(x[0], index) for index, x in enumerate(ebook.opf.spine.itemrefs)])
    play_order_labels = [x[0] for x in ebook.opf.spine.itemrefs]
    play_order = [str(i) for i in range(len(ebook.opf.spine.itemrefs))]
    # Manifest keys that appear in the spine, in spine order.
    labels = sorted(
        list(
            filter(
                lambda x: x in play_order_label_to_index,
                ebook.opf.manifest.keys())),
        key=lambda x: play_order_label_to_index[x])
    # Matching manifest hrefs, also in spine order.
    source_references = list(
        map(
            lambda x: x[1],
            sorted(
                list(filter(lambda x: x[0] in play_order_label_to_index,
                            ebook.opf.manifest.items())),
                key=lambda x: play_order_label_to_index[x[0]])))
    #"""
    if debug:
        print("play_order:\n'%s'\n\n" % str(play_order))
        print("play_order_labels:\n'%s'\n\n" % str(play_order_labels))
        # print("play_order_label_to_index dict:\n'%s'\n\n" % str(play_order_label_to_index))
        print("labels:\n'%s'\n\n" % str(labels))
        print("source_references:\n'%s'\n\n" % str(source_references))
    assert len(labels) == len(source_references) and len(labels) == len(play_order_labels), (
        "Not true that: len(labels): '%d' == len(source_references): '%d' and len(labels): '%d' == len(play_order_labels): '%d'"
        % (len(labels), len(source_references), len(labels), len(play_order_labels)))
    chapter_label_source_tuples = list(zip(play_order, play_order_labels, labels, source_references))
    if debug:
        print("chapter_label_source_tuples:\n%s\n\n" % "\n".join(list(map(lambda x: str(x), chapter_label_source_tuples))))
    full_book_content = list()
    for chapter_order, chapter_order_label, chapter_title, source_ref in chapter_label_source_tuples:
        chapter_info_string = "Book: %s Chapter: %s titled: %s"\
            % (book_title, chapter_order, chapter_title)
        try:
            chapter_content = ebook.read_item(source_ref)
            if debug:
                print("chapter_order: '%s', chapter_order_label: '%s', chapter_title: '%s', source_ref: '%s', read source_ref:\n'%s'\n\n" % (
                    chapter_order, str(chapter_order_label), str(chapter_title), str(source_ref), str(ebook.read_item(source_ref)[:min(20, len(ebook.read_item(source_ref)))])))
        except Exception as e:
            print("Exception: e: '%s'" % str(e))
            if isinstance(e, KeyError):
                print("KeyError: e: '%s'" % str(e))
                sys.stdout.flush()
                # Missing jpg members are expected; anything else is fatal.
                if ".jpg" in str(e):
                    continue
                else:
                    raise e
            print("Failed getting chapter: %s %s in book %s, exception: %s" % (chapter_order, chapter_title, ebook.toc.title, str(e)))
            # Retry once with any '#fragment' stripped from the reference.
            ref_fixed = re.sub("#.*", "", source_ref)
            try:
                chapter_content = ebook.read_item(ref_fixed)
                print("Success on retry! %s" % chapter_info_string)
            except:
                print("FAILED ON RETRY TOO for book titled: %s with ref: %s." % (book_title, ref_fixed))
        try:
            string_chapter_content = chapter_content.decode("utf-8")
        except UnicodeDecodeError as e:
            # Non-UTF-8 chapter content is skipped entirely.
            print("TypeError while decoding content with UTF-8 on chapter titled: '%s'" % chapter_title)
            continue
        chapter_content = html_to_text.handle(str(string_chapter_content))
        full_book_content.append((chapter_order, chapter_title, chapter_content))
    with open(os.path.join(output_file_dir, os.path.splitext(epub_file_name)[0] + ".txt"), "w") as txt_file:
        for chapter_index, chapter_tuple in enumerate(full_book_content):
            order = chapter_tuple[0]
            # Fall back to the positional index when the order label is blank.
            if order.strip() == "":
                order = str(chapter_index)
            title = chapter_tuple[1]
            content = chapter_tuple[2]
            if not dry_run:
                txt_file.write(content)
            # Per-chapter file: <book>--<zero-padded order>--<title>.txt
            chapter_file_name = epub_file_name.replace(".epub", "")
            chapter_file_name += "--" + order.zfill(5) + "--" + title
            chapter_file_name += ".txt"
            if not dry_run:
                with open(os.path.join(chapter_files_dir, chapter_file_name), "w") as chapter_txt_file:
                    chapter_txt_file.write(content)
    ebook.close()