Example #1
0
    def __init__(self, file_path: str, num_of_chapters: int):
        """Open the EPUB at ``file_path`` and extract its chapters.

        The parent initializer receives the file's base name (directory part
        stripped) and ``num_of_chapters``. The opened book is handed to
        ``self.extract_chapters`` and the result is stored on ``self.chapters``.
        """
        self.num_of_chapters = num_of_chapters
        super().__init__(os.path.basename(file_path), num_of_chapters)
        self.chapters = self.extract_chapters(epub.open_epub(file_path))
Example #2
0
def get_epub_info(filepath):
    """Return the ``href`` of every manifest item of the EPUB at ``filepath``."""
    manifest_entries = epub.open_epub(filepath).opf.manifest.values()
    return [entry.href for entry in manifest_entries]
Example #3
0
def make_book_text():
    """Extract the text of ``book.epub`` and pack it into a NARC archive.

    Every manifest item is fed to ``Article`` as HTML and parsed; the parsed
    texts are concatenated, written UTF-8 encoded to
    ``Data_Input/5_Book_Text/LittleWomen_00.bd`` and the directory is then
    packed into ``Data_Input/5_Book_Text.narc`` by running ``narctool.exe``
    under wine.
    """
    out_dir = "Data_Input/5_Book_Text"

    book = epub.open_epub("book.epub")
    try:
        parts = []
        for item in book.opf.manifest.values():
            # Read each item once; the original read it twice and threw the
            # first copy away.
            html = book.read_item(item)

            data = Article("", language="en")
            data.set_html(html)
            data.parse()

            parts.append(data.text)
    finally:
        # Release the archive handle even if parsing fails.
        book.close()

    # Join once instead of growing a string in the loop.
    book_text = "".join(parts)

    # makedirs also creates a missing "Data_Input" parent, where os.mkdir
    # would raise.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    with open(out_dir + "/LittleWomen_00.bd", "wb") as dest_file:
        dest_file.write(book_text.encode("utf-8"))

    subprocess.call([
        "wine", "narctool.exe", "p", out_dir,
        out_dir + ".narc"
    ])
Example #4
0
    def __init__(self, epub_path):
        """Open the EPUB at ``epub_path`` in ``'r'`` mode and reset navigation state."""
        self.epub_file = open_epub(epub_path, 'r')

        # Item currently used for navigation, and the soup built for that
        # item; neither is set until navigation starts.
        self.current_item = self.item_data_soup = None
Example #5
0
def parse_epub(_file):
    """Return ``{'title': ..., 'articles': [...]}`` for the epub ``_file``.

    If the archive cannot be opened (``BadZipfile``) the empty skeleton is
    returned. A spine item becomes an article only when its title, author
    and content are all truthy.
    """
    try:
        magazine = epub.open_epub(_file)
    except BadZipfile:
        return {'title': '', 'articles': []}

    # The magazine title comes from the table of contents.
    magazine_title = magazine.toc.title

    articles = []
    for item_id, _ in magazine.opf.spine.itemrefs:
        soup = BeautifulSoup(magazine.read_item(magazine.get_item(item_id)))

        title = find_article_title(soup)
        author = find_article_author(soup)
        content = find_article_content(soup)
        if not all([title, author, content]):
            continue

        articles.append({'title': title, 'author': author, 'content': content})

    return {'title': magazine_title, 'articles': articles}
Example #6
0
def parse_epub(_file):
    """Collect the magazine title and its complete articles from an epub.

    Returns a dict with the keys ``'title'`` and ``'articles'``. When the
    file raises ``BadZipfile`` on opening, the title is ``''`` and the
    article list is empty. Spine items lacking a title, an author or
    content are left out.
    """
    result = {'title': '', 'articles': []}
    try:
        magazine = epub.open_epub(_file)
    except BadZipfile:
        return result

    result['title'] = magazine.toc.title

    for item_id, _ in magazine.opf.spine.itemrefs:
        raw = magazine.read_item(magazine.get_item(item_id))
        soup = BeautifulSoup(raw)

        fields = {
            'title': find_article_title(soup),
            'author': find_article_author(soup),
            'content': find_article_content(soup),
        }
        if all(fields.values()):
            result['articles'].append(fields)

    return result
def epub_to_corpus(mixed, as_dict=False):
    '''
    Extract the text of every HTML page of an epub, ordered by page number.

    mixed: a filepath or an epub object (``epub.EpubFile``)
    as_dict: when true, return ``{manifest key: text}`` instead of a list

    Returns the list of page texts (as produced by ``etree.tostring`` with
    ``method='text'``), or the key -> text dict when ``as_dict`` is set.
    Raises TypeError when ``mixed`` is neither an EpubFile nor a string.
    '''
    if   isinstance(mixed, epub.EpubFile):
        book = mixed
    elif isinstance(mixed, basestring):
        book = epub.open_epub(mixed)
    else:
        raise TypeError("don't know how to handle %s of type %s" % (mixed, type(mixed)))

    # RETURNS: the page texts and, in the same order, their manifest keys
    corpus = []
    valid_klist = []

    # we only want page/document content: hrefs ending in .htm or .html
    p_page = re.compile(r'(.*?)\.(html?)$')
    # to separate the key from its numbering: non-digit prefix + digits
    p_key = re.compile(r'^(\D+)(\d+)$')

    # counts how many page keys share each non-digit prefix
    prefix_counter = Counter()

    bookdata = {}
    for key in book.opf.manifest.keys():
        manifest_item = book.opf.manifest[key]
        match = p_page.match(manifest_item.href)
        if match:
            # NOTE(review): p_key.match() returns None for a key that is not
            # "<non-digits><digits>", which makes .group raise AttributeError.
            prefix = p_key.match(key).group(1)
            prefix_counter[prefix] += 1
            bookdata[key] = book.read_item(manifest_item)

    # most frequent prefix wins
    # NOTE(review): raises IndexError when no page matched at all.
    page_prefix = sorted(prefix_counter, key=prefix_counter.get, reverse=True)[0]

    # strips out the leading prefix (e.g. 'html') so keys sort by their number
    # NOTE(review): a page key with a different prefix is sliced at the wrong
    # offset and may make int() raise ValueError.
    keyfunc = lambda k: int(k[len(page_prefix):])
    for key in sorted(bookdata.keys(), key=keyfunc):
        xml = bookdata[key]

        # it turns out the book data may be binary data, e.g. JPEG
        tree = etree.fromstring(xml)

        ## simple xpath won't work due to namespace prefixing
        ## either use this notation
        ## tree.xpath("//xhtml:style", namespaces={'xhtml': tree.nsmap[None]})
        ## here, the None ns is actually the 'xhtml' ns
        ## though i think you can redefine it to whatever you want
        ## as opposed to 'xhtml' in the query
        ## or this
        # drop every <style> element so its CSS does not end up in the text
        for node in tree.xpath('''//*[local-name() = "style"]'''):
            node.getparent().remove(node)
        text = etree.tostring(tree, encoding = 'utf8', method = 'text')

        valid_klist.append(key)
        corpus.append(text)

    if as_dict:
        return dict(zip(valid_klist, corpus))
    return corpus
Example #8
0
def read_epub_html(path):
    """Return the concatenated text of every ``.html`` manifest item.

    Each item's content is passed through ``str()`` and parsed with
    BeautifulSoup's ``html.parser``; the extracted texts are joined without
    a separator.
    """
    texts = []
    with epub.open_epub(path) as book:
        for entry in book.opf.manifest.values():
            if not entry.href.endswith('.html'):
                continue
            markup = str(book.read_item(entry))
            texts.append(BeautifulSoup(markup, 'html.parser').get_text())
    return ''.join(texts)
Example #9
0
def get_book_fields(path):
	"""Return ``(title, summary)`` for the EPUB at ``path``.

	The title is the TOC title up to its first ``(``. The summary is the
	result of ``get_description`` for the full TOC title, followed by a fixed
	introduction sentence and the sorted chapter names (each cut at its first
	``(``) joined with ``<br/>``.
	"""
	toc = epub.open_epub(path).toc
	chapter_labels = [point.labels[0][0] for point in toc.nav_map.nav_point[0].nav_point]
	title = toc.title.split('(')[0]
	description = get_description(toc.title)
	intro = 'This is a compilation of articles from Wikipedia about %s, formatted as an ebook for easy reading.  Topics include:<br/>' % title
	topics = '<br/>'.join(sorted(label.split('(')[0] for label in chapter_labels))
	summary = description + intro + topics
	return (title, summary)
Example #10
0
 def get_linear_items_data(self, in_file_name):
     """Return the content of every linear spine item, in spine order."""
     book = epub.open_epub(in_file_name)
     linear_payloads = []
     for item_id, is_linear in book.opf.spine.itemrefs:
         entry = book.get_item(item_id)
         if not is_linear:
             continue
         linear_payloads.append(book.read_item(entry))
     return linear_payloads
Example #11
0
    def __init__(self, filePath, removeFileWhenComplete=False):
        """Open the EPUB at ``filePath`` and wrap it in an ``epub.Book``.

        Failures are logged at ``xbmc.LOGERROR`` instead of being raised; in
        that case ``self.book`` (and possibly ``self.bookFile``) stays ``None``.
        """
        EBookBase.__init__(self, filePath, removeFileWhenComplete)
        self.bookFile = None
        self.book = None

        try:
            self.bookFile = epub.open_epub(self.filePath)
            self.book = epub.Book(self.bookFile)
        except Exception:
            # Still best effort, but a bare "except:" would also swallow
            # KeyboardInterrupt and SystemExit.
            log("EPubEBook: Failed to process eBook %s with error: %s" % (self.filePath, traceback.format_exc()), xbmc.LOGERROR)
Example #12
0
def convert_epub_to_html(path_epub):
    """Concatenate the HTML items of an EPUB into a single ``.html`` file.

    The output path is ``<first component of path_epub>/ck12-html/<stem>.html``
    where ``<stem>`` is the EPUB file name up to its first dot. Only manifest
    items whose href matches ``PATTERN_HTML`` are written, each followed by a
    newline.
    """
    components = path_epub.split('/')
    out_path = components[0] + '/ck12-html/' + components[-1].split('.')[0] + '.html'

    # Open the EPUB first so an unreadable input does not leave an empty
    # output file behind.
    book = epub.open_epub(path_epub)
    try:
        # The context manager closes the output even when a read fails.
        with open(out_path, 'w') as out_file:
            for item in book.opf.manifest.values():
                if PATTERN_HTML.match(item.href):
                    out_file.write(book.read_item(item) + '\n')
    finally:
        book.close()
Example #13
0
    def __init__(self, filePath, removeFileWhenComplete=False):
        """Open the EPUB at ``filePath`` and build an ``epub.Book`` from it.

        Errors while opening or parsing are logged (``xbmc.LOGERROR``) rather
        than raised, leaving ``self.book`` as ``None``.
        """
        EBookBase.__init__(self, filePath, removeFileWhenComplete)
        self.bookFile = None
        self.book = None

        try:
            self.bookFile = epub.open_epub(self.filePath)
            self.book = epub.Book(self.bookFile)
        except Exception:
            # "except Exception" keeps the best-effort behaviour without
            # trapping KeyboardInterrupt/SystemExit like the bare form did.
            log("EPubEBook: Failed to process eBook %s with error: %s" % (self.filePath, traceback.format_exc()), xbmc.LOGERROR)
Example #14
0
 def get_page(self, filename, page):
     """Return one page of an EPUB as a JSON string.

     The UTF-8 decoded spine items are concatenated, passed through
     ``remove_husk`` and split into sentences; pages are consecutive groups
     of ``self._sentences_per_page`` sentences. The JSON object holds the
     page text, the total page count and the requested page index.
     """
     book = epub.open_epub(self._epub_data_directory + '/' + filename)
     raw = ''.join(
         book.read_item(book.get_item(item_id)).decode("utf-8")
         for item_id, _ in book.opf.spine.itemrefs
     )
     sentences = nltk.sent_tokenize(remove_husk(raw))
     per_page = self._sentences_per_page
     pages = [sentences[start:start + per_page]
              for start in range(0, len(sentences), per_page)]
     payload = {"text": ' '.join(pages[page]), "pages": len(pages), "page": page}
     return json.dumps(payload)
Example #15
0
def get_html(URL):
    """Download every EPUB linked from the page at ``URL``.

    Each link whose href contains ``FILETYPE`` is fetched into
    ``hello.epub``, renamed to ``<first title>.epub`` and recorded in the
    module-level ``books`` list as a one-element list.
    """
    for link in get_soup(URL).find_all('a'):
        file_link = link.get('href')
        # Anchors without an href yield None, which would make the "in" test
        # raise TypeError.
        if not file_link or FILETYPE not in file_link:
            continue
        print(file_link)
        r = requests.get(file_link, allow_redirects=True)
        # Close the download explicitly before it is reopened and renamed.
        with open("hello.epub", 'wb') as download:
            download.write(r.content)
        with epub.open_epub("hello.epub") as book:
            title = book.opf.metadata.titles[0][0]
        os.rename("hello.epub", title + ".epub")
        books.append([title + ".epub"])
Example #16
0
 def extract_content(self, filepath):
     """Return the extracted text of all XHTML items of an EPUB.

     Every manifest item with media type ``application/xhtml+xml`` is run
     through ``extract_html``; the UTF-8 encoded results are written one per
     line and the combined buffer is returned via ``to_utf8``.
     """
     book = epub.open_epub(filepath)
     sio = StringIO()
     for item in book.opf.manifest.values():
         # Compare for equality: ('application/xhtml+xml') is a plain string,
         # not a tuple, so "in" was a substring test that also accepted
         # media types such as '' or 'xml'.
         if item.media_type == 'application/xhtml+xml':
             data = book.read_item(item)
             sio.write(extract_html(data).encode('utf-8'))
             sio.write("\n")
     return to_utf8(sio.getvalue())
Example #17
0
 def extract_content(self, filepath):
     """Collect the text of every XHTML manifest item of an EPUB.

     Items whose media type is exactly ``application/xhtml+xml`` are passed
     to ``extract_html``; each UTF-8 encoded result is followed by a newline.
     The accumulated buffer is returned through ``to_utf8``.
     """
     book = epub.open_epub(filepath)
     sio = StringIO()
     for item in book.opf.manifest.values():
         # The original "in ('application/xhtml+xml')" tested substring
         # membership in a string (no trailing comma, so no tuple) and thus
         # matched partial or empty media types as well.
         if item.media_type == 'application/xhtml+xml':
             data = book.read_item(item)
             sio.write(extract_html(data).encode('utf-8'))
             sio.write("\n")
     return to_utf8(sio.getvalue())
Example #18
0
def parse_epub(path_to_epub: str) -> (str, str, list):
    """Return ``(title, author, text_by_lines)`` for the EPUB at ``path_to_epub``.

    Title and author are the text of the first ``dc:title`` / ``dc:creator``
    element of the OPF document. The lines come from running ``html2text``
    over every spine item and splitting the result on newlines.
    """
    doc = open_epub(path_to_epub)

    def first_text(tag_name):
        # Text of the first element with this tag in the OPF XML document.
        elements = doc.opf.as_xml_document().getElementsByTagName(tag_name)
        return elements[0].firstChild.nodeValue

    title = first_text('dc:title')
    author = first_text('dc:creator')

    text_by_lines = []
    for item_id, _linear in doc.opf.spine.itemrefs:
        markup = doc.read_item(doc.get_item(item_id)).decode()
        text_by_lines.extend(html2text(markup).split('\n'))
    return title, author, text_by_lines
Example #19
0
def read_book(bookfile):
    """Return the text of every ``<p>`` element found in the book's manifest items.

    Each paragraph's text is padded with one space on both sides before the
    pieces are concatenated.
    """
    book = epub.open_epub(bookfile)
    pieces = []
    for entry in book.opf.manifest.values():
        soup = bsoup(book.read_item(entry), 'html.parser')
        pieces.extend(' ' + paragraph.get_text() + ' '
                      for paragraph in soup.find_all('p'))
    return ''.join(pieces)
Example #20
0
def process_epub(userParams):
    """
        Parse manifest items using epub library
    """
    book = epub.open_epub(userParams['input'])

    print "Parsing epub file at " + userParams['input']

    itemHrefs = []
    for item in book.opf.manifest.values():
        itemHrefs.append(item.href)

    return itemHrefs 
Example #21
0
def get_text_from_epub_file(filename: str) -> str:
    """Return the UTF-8 text of every readable item of an EPUB, concatenated.

    Items missing from the archive (``KeyError``) and items that are not
    valid UTF-8 (``UnicodeDecodeError``, e.g. images) are skipped. An EPUB
    with no readable text yields ``''``.

    The previous implementation overwrote ``text`` on every iteration, so it
    returned only the last item (possibly as undecoded bytes), and it raised
    ``UnboundLocalError`` when no item could be read.
    """
    book = epub.open_epub(filename)
    chunks = []
    try:
        for item in book.opf.manifest.values():
            try:
                chunks.append(book.read_item(item).decode('utf-8'))
            except (KeyError, UnicodeDecodeError):
                continue
    finally:
        # Close the archive even if reading fails unexpectedly.
        book.close()
    return ''.join(chunks)
Example #22
0
def process_epub(userParams):
    """
        Open the epub named by userParams['input'], print a progress line
        and return the list of manifest item hrefs.
    """
    epub_path = userParams['input']
    book = epub.open_epub(epub_path)

    print ("Parsing epub file at " + epub_path)

    return [manifest_item.href for manifest_item in book.opf.manifest.values()]
Example #23
0
def book(request, id=1):
    """Render ``book.html`` with the content of every spine item of a book.

    The ``Book`` row with primary key ``id`` supplies the EPUB file name,
    which is resolved under ``settings._PATH + '\\media\\'``. The template
    context holds the item contents under ``'chapters'`` in spine order.
    """
    import epub
    record = Book.objects.get(pk=id)
    filename = settings._PATH + '\\media\\' + record.chapters.name
    print(filename)

    book_file = epub.open_epub(filename)
    chapters = [
        book_file.read_item(book_file.get_item(item_id))
        for item_id, _linear in book_file.opf.spine.itemrefs
    ]

    return render(request, 'book.html', {'chapters': chapters})
Example #24
0
def processEpubFile(filename):
    """Return the words of every readable manifest item of an EPUB.

    Items that raise ``KeyError`` on reading are skipped. Content is decoded
    as UTF-8 when possible (left as read otherwise), then passed through
    ``removeProper``, ``removePunc`` and ``lower()`` before ``getWords``
    splits it.
    """
    book = epub.open_epub(filename)
    epub_words = []
    for entry in book.opf.manifest.values():
        try:
            raw = book.read_item(entry)
        except KeyError:
            continue
        try:
            raw = raw.decode('utf-8')
        except UnicodeDecodeError:
            pass
        cleaned = removePunc(removeProper(raw)).lower()
        epub_words.extend(getWords(cleaned))
    book.close()
    return epub_words
Example #25
0
def read_epub_metadata(path):
    """Return a dict of selected OPF metadata for the EPUB at ``path``.

    ``description``, ``subject`` (comma-joined) and ``publisher`` are always
    present. ``title``, ``author`` and ``date`` are added only when the
    corresponding metadata list is non-empty, using its first flattened
    value. Every value goes through ``empty_guard``.
    """
    with epub.open_epub(path) as book:
        metadata = book.opf.metadata
        res = {
            'description': empty_guard(metadata.description),
            'subject': empty_guard(",".join(metadata.subjects)),
            'publisher': empty_guard(metadata.publisher)
        }
        optional_fields = (
            ('title', metadata.titles),
            ('author', metadata.creators),
            ('date', metadata.dates),
        )
        for key, groups in optional_fields:
            flattened = list(chain.from_iterable(groups))
            if flattened:
                res[key] = empty_guard(flattened[0])
        return res
def extract_isbn(filename):
    """Return the ISBN (10 or 13 digits, no dashes) for ``filename``.

    The file name itself is searched first. Otherwise, for ``.pdf`` files
    the first ten pages are converted with ``pdftotext`` and searched for a
    dashed ISBN; for ``.epub`` files the OPF identifiers are searched.

    Raises:
        IsbnNotFoundError: when no ISBN is found by any of these means.
    """
    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    match = re.search(r'(\d{13}|\d{10})', filename)
    if match:
        return match.group()

    data_format = determine_format(filename)
    if data_format == '.pdf':
        command = ['pdftotext', '-f', '1', '-l', '10', filename, '-']
        text = subprocess.check_output(command, stderr=subprocess.PIPE).decode(ENCODING)
        match = re.search(r'(?:[0-9]{3}-)?[0-9]{1,5}-[0-9]{1,7}-[0-9]{1,6}-[0-9]', text)
        if match:
            return match.group().replace('-', '')
    elif data_format == '.epub':
        # Close the archive when done.
        with epub.open_epub(filename) as book:
            # Search every identifier, not only the first, and skip empty
            # fields; an EPUB without identifiers now falls through to
            # IsbnNotFoundError instead of raising IndexError.
            fields = [
                field
                for identifier in book.opf.metadata.identifiers
                for field in identifier
                if field
            ]
        match = re.search(r'(\d{13}|\d{10})', ' '.join(fields))
        if match:
            return match.group()

    raise IsbnNotFoundError("ISBN not found in {}".format(filename))
Example #27
0
def get_epub_author(book):
    '''Retrieve the book's author from an Epub file.

    @param book: the book information containing the file name, path,...
    @return: the UTF-8 encoded first creator if found, else an error message

    '''
    try:
        eb = epub.open_epub(books_path+'/'+book['file'])
        try:
            # The old ascii fallback could never succeed where utf-8 failed,
            # so a single attempt is equivalent.
            auth = eb.opf.metadata.creators[0][0].encode('utf-8')
        except Exception:
            auth = "error epub retrieve creator"
        finally:
            # Always release the archive, whatever happened above.
            eb.close()
    except Exception:
        # "except Exception" instead of a bare except: interrupts and
        # SystemExit are no longer swallowed.
        auth = "error epub open " + book['file']
    return auth
Example #28
0
def get_epub_description(book):
    '''Retrieve the book's description from an Epub file.

    @param book: the book information containing the file name, path,...
    @return: the UTF-8 encoded description if found, else an error message

    '''
    try:
        eb = epub.open_epub(books_path + '/' + book['file'])
        try:
            # Retrying with ascii after a utf-8 failure cannot succeed, so
            # only one attempt is made.
            desc = eb.opf.metadata.description.encode('utf-8')
        except Exception:
            desc = "error epub retrieve description"
        finally:
            # Close the archive on every path.
            eb.close()
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # propagate.
        desc = "error epub open "+books_path + '/' + book['file']
    return desc
def main():
  """Turn the images of every EPUB under ``epubs`` into a CBZ archive.

  For each file, the jpg/png entries are extracted into ``<file>_folder``,
  cover images get a ``001_`` prefix, the ``OEBPS/images`` directory is moved
  up one level, ``generate_cbz.py`` is run on it and the resulting ``.cbz``
  is renamed after the EPUB file.
  """
  import subprocess  # local import: only needed for the generate_cbz.py call

  for act_path, _dirs, files in os.walk("epubs"):
    for file_ in files:
      folder = file_+"_folder"
      # Build the path from the directory os.walk is visiting; the fixed
      # "epubs/" prefix failed for files in sub-directories.
      epub_path = os.path.join(act_path, file_)
      with epub.open_epub(epub_path) as book:
        try:
          os.mkdir(folder)
        except OSError:
          print(folder, "already exists, overwriting...")
          shutil.rmtree(folder)
        print(epub_path)
        images = (name for name in book.namelist() if name.endswith(("jpg", "png")))
        for imagename in images:
          imagename = imagename.replace("OEBPS/", "", 1)#epub, pls
          book.extract_item(imagename, to_path=folder)
          if "cover" in imagename:
            print(imagename)
            os.rename(folder+"/OEBPS/"+imagename, folder+"/OEBPS/"+imagename.replace("cover", "001_cover"))
        os.rename(folder+"/OEBPS/images", folder+"/images")
        os.rmdir(folder+"/OEBPS")
        # Argument list instead of a shell string: file names containing
        # quotes or shell metacharacters can no longer break the command.
        subprocess.call(["python", "generate_cbz.py", folder+"/images/"])
        os.rename(folder+"/images/.cbz", folder+"/images/"+file_+".cbz")
Example #30
0
def open_ebook(file, file_out=""):
    """Print the spine items of an EPUB and optionally dump the linear ones.

    Args:
        file: path of the EPUB to open.
        file_out: when non-empty, path of a text file that receives the
            decoded content of every linear item.

    Content is read only when the module-level ``check`` flag is truthy and
    is decoded with the module-level ``encod2`` encoding, ignoring errors.
    """
    out = open(file_out, encoding=encod2, mode='w') if file_out != "" else None
    try:
        book = epub.open_epub(file)
        try:
            for item_id, linear in book.opf.spine.itemrefs:
                item = book.get_item(item_id)
                if not linear:
                    print('Non-linear item "%s"' % item.href)
                    continue
                print('Linear item "%s"' % item.href)
                if check:
                    data = book.read_item(item).decode(encod2, errors='ignore')
                    if out is not None:
                        out.write(data)
        finally:
            book.close()
    finally:
        # Close the output even when opening or reading the EPUB fails.
        if out is not None:
            out.close()
Example #31
0
    def epub_init_doc(self):
        """Unpack the EPUB at ``self.filepath`` and set up the page view.

        The archive is extracted into ``~/.rbook/cache/<inode>``, the spine
        hrefs become ``self.items``, ``self.current_page_idx`` is clamped to
        the last page, the working directory is changed to the directory of
        the OPF file inside the extracted tree, and a ``r_epub.DocScroll``
        is created.

        Raises:
            IOError: when the ``epub`` package is not installed (the window
                is destroyed first).
        """
        try:
            import epub
        except ImportError:
            self.Destroy()
            raise IOError('Install python-epub for EPUB files')
        else:
            import r_epub

            extract_path = os.path.join(os.path.expanduser('~/.rbook/cache/'),
                                        str(os.stat(self.filepath).st_ino))
            # Use a context manager so the archive handle is closed once the
            # extraction is done instead of lingering until garbage collection.
            with zipfile.ZipFile(self.filepath, 'r') as archive:
                archive.extractall(extract_path)
            self.book = epub.open_epub(self.filepath)
            self.extract_path = os.path.join(extract_path,
                                             os.path.dirname(self.book.opf_path))
            chaps = self.book.opf.spine.itemrefs
            self.items = [self.book.opf.manifest[chap[0]].href for chap in chaps]
            self.n_pages = len(self.items)
            if not self.current_page_idx < self.n_pages:
                self.current_page_idx = self.n_pages - 1
            os.chdir(self.extract_path)
            self.doc_scroll = r_epub.DocScroll(self, self.current_page_idx)
    def __init__(self, name):
        self.regexImageSource = re.compile(r'''<img .*?src=['"](.*?)['"] ?.*?/>''')
        self.xhtml_template = u'''<?xml version='1.0' encoding='utf-8'?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>{title}</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <!--<link href="../../stylesheet.css" type="text/css" rel="stylesheet"/>-->
  </head>
  <body>
    <h1>{title}</h1>
    {content}
  </body>
</html>'''
        self.defaultMeta = {
            'title': None,
            'language': 'en',
            'publisher': 'The MagPi LTD',
            'dates': [],
        }
        self.book = epub.open_epub(name + '.epub', u'w')
        self.currentPlayOrder = 0
    def __init__(self, name):
        self.regexImageSource = re.compile(
            r'''<img .*?src=['"](.*?)['"] ?.*?/>''')
        self.xhtml_template = u'''<?xml version='1.0' encoding='utf-8'?>
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>{title}</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <!--<link href="../../stylesheet.css" type="text/css" rel="stylesheet"/>-->
  </head>
  <body>
    <h1>{title}</h1>
    {content}
  </body>
</html>'''
        self.defaultMeta = {
            'title': None,
            'language': 'en',
            'publisher': 'The MagPi LTD',
            'dates': [],
        }
        self.book = epub.open_epub(name + '.epub', u'w')
        self.currentPlayOrder = 0
Example #34
0
import epub
import re
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from SentiWordNet import SentiWordNet

WINDOW_SIZE = 5000
BOOK = "pg730"

senti = SentiWordNet.SentiWordNet()
sentiment = senti.reduce()

book = epub.open_epub('/home/yves/Documents/projects/happyEndings/'+BOOK+'.epub')

# allSentences collects one token list per sentence; w counts the
# whitespace-separated words of every kept paragraph line.
allSentences = []
w = 0
for item in book.opf.manifest.values():
    data = book.read_item(item)
    for line in data.split("\n"):
        # Only lines that open a <p> element carry book text.
        if not re.search("^<p", line):
            continue
        # Strip the markup before tokenizing.
        line = re.sub("<.*?>","", line)
        sentences = [word_tokenize(t) for t in sent_tokenize(line)]
        allSentences.extend(sentences)
        w += len(line.split())

scores = []
Example #35
0
#!/usr/bin/python
import epub

book = epub.open_epub('rpo.epub')

for item in book.opf.manifest.values():
    # read the content
    data = book.read_item(item)
    # get chapter 1

print data
import xml.etree.ElementTree as ET

tree = ET.fromstring(data)
print tree.findall('content')

# item = book.get_item('chapter')
# # display the same thing
# print book.read_item(item)
Example #36
0
def getEpub():
    """Open the EPUB at a hard-coded Windows path; nothing is returned."""
    import epub
    book = epub.open_epub(
        'C:\\Users\\pdavio\\Desktop\\DigitalHumanities\\wasteland\\1321-0.epub')
Example #37
0
# Authors available for training: the last path component of every
# sub-directory of `path` (and of `pathValidation` for the test list).
# The first os.walk entry is the root itself, hence the [1:].
authorList=[pathi.split("/")[-1] for pathi in [x[0] for x in os.walk(path)][1:]]
testList=[pathi.split("/")[-1] for pathi in [x[0] for x in os.walk(pathValidation)][1:]]
nauthors=len(authorList)

# Loop through all the folders under `path`, then all folders under
# `pathValidation`.
for pathi in [x[0] for x in os.walk(path)][1:]+[x[0] for x in os.walk(pathValidation)][1:]:
    
    # One entry per book; each entry is the list of that book's chapter features.
    authorFeatures=[]
    # Get all epub books from the current directory
    books=[file for file in os.listdir(pathi) if file.endswith(".epub")]
    # The books are opened by bare file name below, so the working directory
    # has to be the folder that contains them.
    # NOTE(review): if `path`/`pathValidation` are relative, the listdir of
    # the next iteration runs from the changed directory -- confirm they are
    # absolute.
    os.chdir(pathi)
    # Loop on all books from that directory
    for bookName in books:
        # Open the epub book as an object thanks to the epub package
        book = epub.open_epub(bookName)
        # Get all chapter "names" (spine item hrefs) from the book
        chapterNames = [book.get_item(item_id).href for item_id, linear in book.opf.spine.itemrefs]
        # Keep only hrefs that start with 'ch'
        chapterNames = [ch for ch in chapterNames if ch[:2]=='ch']
        bookFeaturesi=[]
        # Loop on the chapters in the current book
        for ch in chapterNames:
            tmp=functions.extractFeatures(book,ch,minstring,comWords)
            # A result equal to False means no features for this chapter; skip it.
            if tmp==False:
                continue
            bookFeaturesi.append(tmp)
        authorFeatures.append(bookFeaturesi)
    # Substring tests decide whether the folder feeds the training set, the
    # test set, or both.
    if path in pathi:
        trainFeatures.append(authorFeatures)
    if pathValidation in pathi:
        testFeatures.append(authorFeatures)
Example #38
0
__author__ = 'mak'

import epub
from bs4 import BeautifulSoup

book = epub.open_epub('TFASbook.epub')
epubNavPointsList = book.toc.nav_map.nav_point
# List the chapters so the user can pick one by number.
for point in epubNavPointsList:
    print('%s. %s'%(point.play_order, point.labels[0][0]))
chosenChapter = int(input("Give me the chapter: "))
curPoint = epubNavPointsList[chosenChapter - 1]
# The src can carry a "#filepos" fragment at the end, which has to go.
# partition() leaves the string untouched when there is no '#'; the old
# find()/slice pair cut off the last character in that case because find()
# returned -1.
srcOfFile = curPoint.src.partition('#')[0]
curItem = book.get_item_by_href(srcOfFile)
readedItemB = book.read_item(curItem)
readedItem = readedItemB.decode('utf-8', 'ignore')
soup = BeautifulSoup(readedItem)
# Scripts and styles are not part of the readable text.
for script in soup(["script", "style"]):
    script.extract()
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
textWithoutDots = text.translate({ord(i):None for i in '1234567“890:"!,.?/'})
# The text is newline-joined, so split on any whitespace; split(' ') kept
# whole lines glued together and counted empty strings as words.
listOfAllwords = textWithoutDots.split()
print('There are %s words in text.'% len(listOfAllwords))
Example #39
0
#!/usr/bin/python
import epub

book = epub.open_epub('rpo.epub')

for item in book.opf.manifest.values():
    # read the content
    data = book.read_item(item)
    # get chapter 1

print data
import xml.etree.ElementTree as ET
tree = ET.fromstring(data)
print tree.findall('content')

# item = book.get_item('chapter')
# # display the same thing
# print book.read_item(item)

Example #40
0
def open_book(path):
    """Open the EPUB at ``path``.

    Returns ``None`` when opening raises ``BadEpubFile``, ``BadZipFile``,
    ``KeyError`` or ``IndexError``.
    """
    try:
        opened = open_epub(path)
    except (BadEpubFile, BadZipFile, KeyError, IndexError):
        opened = None
    return opened
Example #41
0
# -*- coding: utf-8 -*-
__author__ = 'User'

import epub
import re
from xml.etree import ElementTree
from cStringIO import StringIO

class AllEntities:
    """Entity lookup that maps ``nbsp`` to ``""`` and any other key to itself."""

    def __getitem__(self, key):
        # key is the entity name; only "nbsp" gets special treatment.
        return "" if key == "nbsp" else key

book = epub.open_epub('book.epub')
first_file_name = book.toc.nav_map.nav_point[4].src
print first_file_name

chapter = book.open(first_file_name)  # Open from zip
string = chapter.read()
print(string[:75])

first_file = StringIO(string)
parser = ElementTree.XMLParser();
parser.parser.UseForeignDTD(True); # because ElementTree chokes on &nbsp;
parser.entity = AllEntities()

etree = ElementTree.ElementTree()
tree = etree.parse(first_file, parser=parser)
Example #42
0
# Solr connections for the 'book' and 'cover' cores on the local instance,
# both with a 10 second timeout.
book_solr_connection = pysolr.Solr('http://localhost:8983/solr/book',
                                   timeout=10)
cover_solr_connection = pysolr.Solr('http://localhost:8983/solr/cover',
                                    timeout=10)

# Directory that holds the source EPUB files.
epub_data_directory = 'all_data'
# Directory created below if it does not exist yet.
text_data_directory = 'text'
sentences_per_page = 21
data_directory_files = os.listdir(epub_data_directory)
# Keep only the entries whose name ends in '.epub'.
epub_directory_files = filter(lambda x: x.endswith('.epub'),
                              data_directory_files)
if not os.path.exists(text_data_directory):
    os.makedirs(text_data_directory)
for epub_file in epub_directory_files:
    print(epub_file)
    book = epub.open_epub(epub_data_directory + '/' + epub_file)
    date, creator, title = '', '', ''
    if len(book.opf.metadata.dates) > 0:
        date, _ = book.opf.metadata.dates[0]
        if date not in ['', 'NONE']:
            date = parse(date, fuzzy=True).year
        else:
            date = ''
    if len(book.opf.metadata.creators) > 0:
        creator, _, _ = book.opf.metadata.creators[0]
    if len(book.opf.metadata.titles) > 0:
        title, _ = book.opf.metadata.titles[0]
    description = book.opf.metadata.description
    if not description:
        description = ''
    cover_solr_connection.add([{
Example #43
0
from collections import Counter, OrderedDict
from base64 import b64encode
import argparse
from zipfile import ZipFile

# Command line: two positionals (source epub, JSON destination) and one
# optional zip file for binary data.
parser = argparse.ArgumentParser(description="Convert epub books to JSON")
parser.add_argument(
    'epub_source',
    help='the epub document to convert',
)
parser.add_argument(
    'destination',
    help='file name to write JSON to',
)
parser.add_argument(
    '--binary-zip',
    help='instead of base64 encoding binary data, write it to this zip file instead',
    dest='zipfile',
)

args = parser.parse_args()

# Open the epub named on the command line.
book = epub.open_epub(args.epub_source)
# Extend BadgerFish rules (http://badgerfish.ning.com/) to
# - add tail properties from lxml as $2
# - removes stripping of whitespace around text as this may be needed
#   when displaying the book
class ExtBadgerFish(BadgerFish):
    def data(self, root):
        'Convert etree.Element into a dictionary'
        value = self.dict()
        children = [node for node in root if isinstance(node.tag, basestring)]
        for attr, attrval in root.attrib.items():
            attr = attr if self.attr_prefix is None else self.attr_prefix + attr
            value[attr] = self._convert(attrval)
        if root.text and self.text_content is not None:
            text = root.text.strip()
            if text:
Example #44
0
 def setUp(self):
     """Open the EPUB at ``self.epub_path`` and wrap it in an ``epub.Book``."""
     opened = epub.open_epub(self.epub_path)
     self.epub_file = opened
     self.book = epub.Book(opened)
Example #45
0
 def setUp(self):
     """Open the EPUB at ``self.epub_path`` and keep it on ``self.epub_file``."""
     source_path = self.epub_path
     self.epub_file = epub.open_epub(source_path)
Example #46
0
def open_book(path):
	"""Return the opened EPUB for ``path``, or ``None`` on a known failure.

	The failures treated as "cannot open" are ``BadEpubFile``,
	``BadZipFile``, ``KeyError`` and ``IndexError``.
	"""
	book = None
	try:
		book = open_epub(path)
	except (BadEpubFile, BadZipFile, KeyError, IndexError):
		pass
	return book
def open_book(path):
  """Open the EPUB at ``path``; report the failure and return ``None`` on error."""
  try:
    return epub.open_epub(path)
  except Exception:
    # Still best effort, but KeyboardInterrupt/SystemExit are no longer
    # swallowed as they were by the bare "except:". The print form works on
    # Python 2 and 3 and produces the same text as before.
    print('Can`t open %s' % (path,))
    return None
Example #48
0
    set_mode((pygame.display.Info().current_w, \
              pygame.display.Info().current_h), FULLSCREEN)
'''

##### CONSTANTS #####

TIME_INTERVAL = set_word_speed()  # Converts WPM to decimal seconds.
FILE = choose_file()  # Choose EPUB file.
CONTENTS = []  # Contents of e-book (raw). To process before displaying.
PUNCTUATION_END = '.,?!:;}])\'">~*+=-%'
PUNCTUATION_START = '([{\'"<~*=-$%#@'
ENGLISH_DICTIONARY = enchant.Dict("en_US")

##### Add raw data to CONTENTS #####

# One entry per manifest item, in manifest order.
book = epub.open_epub(FILE)
CONTENTS.extend(book.read_item(entry) for entry in book.opf.manifest.values())

######## Begin program #############

for sentence in CONTENTS:
    for candidate_word in sentence.split():
        try:
            word = candidate_word.decode("utf-8")
            if ENGLISH_DICTIONARY.check(word):
                time.sleep(TIME_INTERVAL)
                print(word)
            elif word[0] in PUNCTUATION_START and word[-1] not in PUNCTUATION_END \
            and ENGLISH_DICTIONARY.check(word[1:]):
                time.sleep(TIME_INTERVAL)
def get_item(path_epub):
    book = epub.open_epub(path_epub)
    for item in book.opf.manifest.values() :
        print item.href
Example #50
0
			text.append(paragraph['#text'])
		elif 'span' in paragraph and '#text' in paragraph['span']:
			text.append(paragraph['span']['#text'])

	# some unicode symbols can be replaced by ascii-compatible characters
	# this helps with the NLP later
	text = [x.replace(u'\u00ad', '')	# word-wrap hyphens
			 .replace(u'\u2019', "'")
			 .replace(u'\u201c', '"')	# opening quotes
			 .replace(u'\u201d', '"')	# ending quotes
			 .replace(u'\u2014', '-')
			 .replace(u'\u2026', '...') for x in text]
	return text

# The EPUB to process is the first command-line argument.
file = sys.argv[1]
book = epub.open_epub(file)

# Open (or create) the SQLite database file and make sure the `sentences`
# table exists; rows are keyed by (sentence_id, title).
db_conn = sqlite3.connect('data.db')
c = db_conn.cursor()
c.execute('''
		CREATE TABLE IF NOT EXISTS sentences
		(
		sentence_id INT,
		sentence TEXT,
		sentence_index INT,
		paragraph INT, 
		chapter INT, 
		title TEXT,
		PRIMARY KEY (sentence_id, title)
		)
	''')
Example #51
0
 def __init__(self, filename):
     """Open the EPUB named by ``filename``, which must be a ``unicode`` string."""
     assert isinstance(filename, unicode)
     opened = epub.open_epub(filename)
     self.pub = opened
Example #52
0
# bk.add_item(c1); bk.toc = (epub.Link('chap_01.xhtml', 'Introduction', 'intro'),(epub.Section('Simple bk'), (c1, ))) # # add chapter; define Table Of Contents
# bk.add_item(epub.EpubNcx()); bk.add_item(epub.EpubNav()) # add default NCX and Nav file
# style = 'BODY {color: white;}'; nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) # define CSS style
# bk.add_item(nav_css); bk.spine = ['nav', c1]; # add CSS file; basic spine
# epub.write_epub('test.epub', bk, {}) # write to the file
#---
import re
import epub as ep
from ebooklib import epub
# http://epub.exirel.me/genindex.html
# Base directory holding the source archives (under 'LN_aside/').
dir1 = 'D:/scan0/lgl/'
for fnm in [
        'a_20160500_20130800.epub', 'b_20130115_2008.epub', 'c_2007_2002.epub',
        'd_2002_1991.epub', 'LexisNexis_201605_1991.epub'
]:  # kk=0;
    bk = ep.open_epub(dir1 + 'LN_aside/' + fnm)
    # Every archive member whose name contains 'index' is one part of the
    # document body; the parts are stitched together in namelist() order.
    o = [q for q in bk.namelist() if 'index' in q]
    o1 = bk.read_item(o[0])
    for k in range(1, len(o)):
        q = bk.read_item(o[k])
        # Drop the closing tags of what has been merged so far and append the
        # next part from just after its opening <body> tag.
        # NOTE(review): '<body class="calibre">' is 22 characters, so +23 also
        # skips one following character (presumably a newline) - confirm.
        o1 = o1[:o1.rfind('</body></html>')] \
            + q[q.find('<body class="calibre">') + 23:]
    # Offsets of every "<n> of <m> DOCUMENTS" header, plus the offset of the
    # final closing tags as an end sentinel.  The pattern is a raw string so
    # that \d is a regex escape rather than an invalid string escape.
    x = [k.start() for k in re.finditer(r'\d* of \d* DOCUMENTS', o1)
         ] + [o1.rfind('</body></html>')]
    # create bk2
    bk2 = epub.EpubBook()
    bk2.set_identifier('LN')
    bk2.add_author('u')
    bk2.set_title(fnm[:-5])  # file name without the '.epub' suffix
    bk2.set_language('en')
    bk2.add_item(
    def __repr__(self):
        """Return a short preview of ``self.content``.

        The preview is the first ten whitespace-separated words joined by
        single spaces, with ``'...'`` appended when the full content is
        longer than 200 characters.
        """
        preview = ' '.join(self.content.split()[:10])
        if len(self.content) <= 200:
            return preview
        return preview + '...'

# Lightweight record with the fields ``len``, ``beg`` and ``end``.
# NOTE(review): presumably the length and begin/end offsets of a matched
# text range - confirm against the code that builds these.
IndexRange = namedtuple('IndexRange', ['len', 'beg', 'end'])

if __name__ == '__main__':
    # The last two command-line arguments are the notes file and the EPUB.
    filepath = sys.argv[-2]
    bookpath = sys.argv[-1]

    book = epub.open_epub(bookpath)
    # Map every manifest id to the raw content of its item.
    mydc = {key: book.read_item(item)
            for key, item in book.opf.manifest.items()}

    # compile the list of notes
    note_list = []
    # Read the lines inside a ``with`` block so the notes file is closed
    # right away instead of being left to the garbage collector.
    with open(filepath) as notes_file:
        note_lines = notes_file.readlines()
    for line_orig in note_lines:
        # "Add a note" lines carry no content of their own.
        if line_orig.lstrip().startswith('Add a note'):
            continue
        line_type = None

        # preprocessing
        line = line_orig.replace('• Delete this highlight', '').strip()
        if not line:
            continue
        # A line of the form "Note: ... Edit" is a user note.
        if line.startswith('Note: ') and line.endswith('Edit'):
            line_type = KindleNote.NOTE
    def __repr__(self):
        """Return a short preview of ``self.content``.

        The preview is the first ten whitespace-separated words joined by
        single spaces, with ``'...'`` appended when the full content is
        longer than 200 characters.
        """
        leading_words = self.content.split()[:10]
        suffix = '...' if len(self.content) > 200 else ''
        return ' '.join(leading_words) + suffix


# Lightweight record with the fields ``len``, ``beg`` and ``end``.
# NOTE(review): presumably the length and begin/end offsets of a matched
# text range - confirm against the code that builds these.
IndexRange = namedtuple('IndexRange', ['len', 'beg', 'end'])

if __name__ == '__main__':

    # The last two command-line arguments are the notes file and the EPUB.
    filepath = sys.argv[-2]
    bookpath = sys.argv[-1]

    book = epub.open_epub(bookpath)
    # Map every manifest id to the raw content of its item.
    mydc = {key: book.read_item(item)
            for key, item in book.opf.manifest.items()}

    # compile the list of notes
    note_list = []
    # Read the lines inside a ``with`` block so the notes file is closed
    # right away instead of being left to the garbage collector.
    with open(filepath) as notes_file:
        note_lines = notes_file.readlines()
    for line_orig in note_lines:
        # "Add a note" lines carry no content of their own.
        if line_orig.lstrip().startswith('Add a note'):
            continue
        line_type = None

        # preprocessing
        line = line_orig.replace('• Delete this highlight', '').strip()
        if not line:
            continue
        if line.startswith('Note: ') and line.endswith('Edit'):
Example #55
0
#

from sys import argv
from os import listdir
from epub import open_epub

# All entries of the current working directory; only names ending in
# ".epub" are processed below.
filelist = listdir(".")

if len(argv) > 1:
    # Any command-line argument selects report mode: print each file name
    # (minus the 5-character ".epub" suffix) next to the first title stored
    # in the book's OPF metadata.
    for filename in filelist:
        if filename.endswith(".epub"):
            book = open_epub(filename, "r")
            print filename[:-5], ":", book.opf.metadata.titles[0][0]
            book.close()
else:
    # No arguments selects rewrite mode: open each book in "a" mode and
    # replace its title list with a single entry derived from the file name.
    for filename in filelist:
        if filename.endswith(".epub"):
            book = open_epub(filename, "a")
            # Python 2: the byte-string file name is decoded to unicode.
            # NOTE(review): the second tuple element (u"") is presumably the
            # title's language attribute - confirm against the epub library.
            book.opf.metadata.titles = [(filename[:-5].decode('utf8'), u"")]
            book.close()
Example #56
0
def epub_to_txt(
        epub_file_name,
        file_dir="epub-files",
        output_file_dir="txt-files",
        chapter_files_dir=None,
        debug=False,
        dry_run=False):
    """Convert one EPUB file to plain text, as a whole book and per chapter.

    The reading order is taken from the spine of the EPUB's OPF package.
    Every spine entry that is also present in the manifest is read, decoded
    as UTF-8 and converted with ``html2text`` (links ignored).  The texts are
    written to ``<output_file_dir>/<name>.txt`` and, one file per chapter, to
    ``<chapter_files_dir>/<name>--<order>--<manifest id>.txt``.

    Args:
        epub_file_name: File name of the EPUB inside ``file_dir``.  Hidden
            files (leading ``.``) and names whose extension is not ``.epub``
            are silently ignored.
        file_dir: Directory that contains the EPUB file.
        output_file_dir: Directory for the whole-book text file.
        chapter_files_dir: Directory for the per-chapter files; defaults to
            ``os.path.join(output_file_dir, CHAPTERS)``.
        debug: When true, print the ordering tables and a short preview of
            every chapter that is read.
        dry_run: When true, no chapter text is written.

    Returns:
        None.

    Raises:
        KeyError: If reading a spine item raises ``KeyError`` and the error
            text does not mention ``.jpg``.
        AssertionError: If the spine and the manifest do not yield the same
            number of entries.
    """
    if chapter_files_dir is None:
        chapter_files_dir = os.path.join(output_file_dir, CHAPTERS)
    _try_mkdirs(output_file_dir)
    _try_mkdirs(chapter_files_dir)

    html_to_text = html2text.HTML2Text()
    html_to_text.ignore_links = True

    # Ignore hidden files (startswith also copes with an empty name)
    if epub_file_name.startswith("."):
        return
    # Ignore files that don't have the epub extension
    if os.path.splitext(epub_file_name)[1] != ".epub":
        return

    print("Opening file: %s" % epub_file_name)
    ebook = epub.open_epub(os.path.join(file_dir, epub_file_name))
    try:
        book_title = ebook.toc.title
        print("Starting on book: %s" % book_title)

        # The first field of every spine itemref is matched against the
        # manifest keys below; its position in the spine is the chapter order.
        play_order_labels = [itemref[0] for itemref in ebook.opf.spine.itemrefs]
        play_order = [str(index) for index in range(len(play_order_labels))]
        play_order_label_to_index = {
            item_id: index for index, item_id in enumerate(play_order_labels)}

        # Manifest entries referenced by the spine, sorted into spine order.
        spine_items = sorted(
            ((item_id, item)
             for item_id, item in ebook.opf.manifest.items()
             if item_id in play_order_label_to_index),
            key=lambda pair: play_order_label_to_index[pair[0]])
        labels = [item_id for item_id, _ in spine_items]
        source_references = [item for _, item in spine_items]

        if debug:
            print("play_order:\n'%s'\n\n" % str(play_order))
            print("play_order_labels:\n'%s'\n\n" % str(play_order_labels))
            print("labels:\n'%s'\n\n" % str(labels))
            print("source_references:\n'%s'\n\n" % str(source_references))

        # zip() below would silently truncate on a mismatch, so check first.
        assert len(labels) == len(source_references) and len(labels) == len(play_order_labels), (
            "Not true that: len(labels): '%d' == len(source_references): '%d' and len(labels): '%d' == len(play_order_labels): '%d'"
            % (len(labels), len(source_references), len(labels), len(play_order_labels)))

        chapter_label_source_tuples = list(
            zip(play_order, play_order_labels, labels, source_references))
        if debug:
            print("chapter_label_source_tuples:\n%s\n\n"
                % "\n".join(str(entry) for entry in chapter_label_source_tuples))

        full_book_content = []
        for chapter_order, chapter_order_label, chapter_title, source_ref in chapter_label_source_tuples:
            chapter_info_string = "Book: %s Chapter: %s titled: %s"\
                % (book_title, chapter_order, chapter_title)
            try:
                chapter_content = ebook.read_item(source_ref)
            except KeyError as e:
                print("Exception: e: '%s'" % str(e))
                print("KeyError: e: '%s'" % str(e))
                sys.stdout.flush()
                if ".jpg" in str(e):
                    # A missing image is not a chapter; skip it.
                    continue
                raise
            except Exception as e:
                print("Exception: e: '%s'" % str(e))
                print("Failed getting chapter: %s %s in book %s, exception: %s"
                    % (chapter_order, chapter_title, book_title, str(e)))
                # Retry without the "#fragment" part of the reference.  The
                # reference is a manifest item here, so use its href; a plain
                # string reference is used as-is.
                href = getattr(source_ref, "href", source_ref)
                ref_fixed = re.sub("#.*", "", href)
                try:
                    chapter_content = ebook.read_item(ref_fixed)
                except Exception:
                    print("FAILED ON RETRY TOO for book titled: %s with ref: %s."
                        % (book_title, ref_fixed))
                    # Nothing was read: skip this chapter rather than reuse
                    # the previous chapter's content (or hit a NameError).
                    continue
                print("Success on retry! %s" % chapter_info_string)
            else:
                if debug:
                    print("chapter_order: '%s', chapter_order_label: '%s', chapter_title: '%s', source_ref: '%s', read source_ref:\n'%s'\n\n"
                        % (
                            chapter_order,
                            str(chapter_order_label),
                            str(chapter_title),
                            str(source_ref),
                            str(chapter_content[:20])))

            try:
                string_chapter_content = chapter_content.decode("utf-8")
            except UnicodeDecodeError:
                print("UnicodeDecodeError while decoding content with UTF-8 on chapter titled: '%s'" % chapter_title)
                continue
            chapter_text = html_to_text.handle(str(string_chapter_content))
            full_book_content.append((chapter_order, chapter_title, chapter_text))

        # NOTE(review): the whole-book file is opened (and truncated) even on
        # a dry run; only the writes are suppressed.  Kept as in the original.
        book_txt_path = os.path.join(
            output_file_dir, os.path.splitext(epub_file_name)[0] + ".txt")
        with open(book_txt_path, "w") as txt_file:
            for chapter_index, (order, title, content) in enumerate(full_book_content):
                if not order.strip():
                    order = str(chapter_index)
                if dry_run:
                    continue

                txt_file.write(content)

                chapter_file_name = epub_file_name.replace(".epub", "")
                chapter_file_name += "--" + order.zfill(5) + "--" + title
                chapter_file_name += ".txt"
                with open(os.path.join(chapter_files_dir, chapter_file_name), "w") as chapter_txt_file:
                    chapter_txt_file.write(content)
    finally:
        # Close the archive on every exit path, including re-raised errors.
        ebook.close()