Ejemplos de BeautifulStoneSoup en Python, ejemplos de calibre.ebooks.BeautifulSoup.BeautifulStoneSoup en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: convert_from.py Proyecto: jayd2446/calibre

    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj

        self.parsed_objects = {}
        self.first_pass()
        self.second_pass()
        self.third_pass()
        self.fourth_pass()
        self.fifth_pass()

Ejemplo n.º 2

0

Mostrar archivo

    def compile_ui(self):
        pat = re.compile(r'''(['"]):/images/([^'"]+)\1''')

        def sub(match):
            ans = 'I(%s%s%s)' % (match.group(1), match.group(2),
                                 match.group(1))
            return ans

        # >>> Entry point
        self._log_location()

        compiled_forms = {}
        self._find_forms()

        # Cribbed from gui2.__init__:build_forms()
        for form in self.forms:
            with open(form) as form_file:
                soup = BeautifulStoneSoup(form_file.read())
                property = soup.find('property', attrs={'name': 'windowTitle'})
                string = property.find('string')
                window_title = string.renderContents()

            compiled_form = self._form_to_compiled_form(form)
            if (not os.path.exists(compiled_form) or
                    os.stat(form).st_mtime > os.stat(compiled_form).st_mtime):

                if not os.path.exists(compiled_form):
                    if self.verbose:
                        self._log(' compiling %s' % form)
                else:
                    if self.verbose:
                        self._log(' recompiling %s' % form)
                    os.remove(compiled_form)
                buf = cStringIO.StringIO()
                compileUi(form, buf)
                dat = buf.getvalue()
                dat = dat.replace('__appname__', 'calibre')
                dat = dat.replace('import images_rc', '')
                dat = re.compile(
                    r'(?:QtGui.QApplication.translate|(?<!def )_translate)\(.+?,\s+"(.+?)(?<!\\)",.+?\)'
                ).sub(r'_("\1")', dat)
                dat = dat.replace('_("MMM yyyy")', '"MMM yyyy"')
                dat = pat.sub(sub, dat)
                with open(compiled_form, 'wb') as cf:
                    cf.write(dat)

            compiled_forms[window_title] = compiled_form.rpartition(
                os.sep)[2].partition('.')[0]
        return compiled_forms

Ejemplo n.º 3

0

Mostrar archivo

Archivo: epub.py Proyecto: syn-gowthamsrungarapu/calibre

 def __init__(self, stream=None):
     if not stream: return
     soup = BeautifulStoneSoup(stream.read())
     container = soup.find(name=re.compile(r'container$', re.I))
     if not container:
         raise OCFException("<container> element missing")
     if container.get('version', None) != '1.0':
         raise EPubException("unsupported version of OCF")
     rootfiles = container.find(re.compile(r'rootfiles$', re.I))
     if not rootfiles:
         raise EPubException("<rootfiles/> element missing")
     for rootfile in rootfiles.findAll(re.compile(r'rootfile$', re.I)):
         try:
             self[rootfile['media-type']] = rootfile['full-path']
         except KeyError:
             raise EPubException("<rootfile/> element malformed")

Ejemplo n.º 4

0

Mostrar archivo

    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
                       convertEntities=BeautifulStoneSoup.XML_ENTITIES,
                       selfClosingTags=self.SELF_CLOSING_TAGS)
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj

        self.parsed_objects = {}
        self.first_pass()
        self.second_pass()
        self.third_pass()
        self.fourth_pass()
        self.fifth_pass()

Ejemplo n.º 5

0

Mostrar archivo

Archivo: epub.py Proyecto: Eksmo/calibre

 def __init__(self, stream=None):
     if not stream: return
     soup = BeautifulStoneSoup(stream.read())
     container = soup.find(name=re.compile(r'container$', re.I))
     if not container:
         raise OCFException("<container> element missing")
     if container.get('version', None) != '1.0':
         raise EPubException("unsupported version of OCF")
     rootfiles = container.find(re.compile(r'rootfiles$', re.I))
     if not rootfiles:
         raise EPubException("<rootfiles/> element missing")
     for rootfile in rootfiles.findAll(re.compile(r'rootfile$', re.I)):
         try:
             self[rootfile['media-type']] = rootfile['full-path']
         except KeyError:
             raise EPubException("<rootfile/> element malformed")

Ejemplo n.º 6

0

Mostrar archivo

Archivo: common_utils.py Proyecto: kbw1/calibre-marvin-manager

    def compile_ui(self):
        pat = re.compile(r"""(['"]):/images/([^'"]+)\1""")

        def sub(match):
            ans = "I(%s%s%s)" % (match.group(1), match.group(2), match.group(1))
            return ans

        # >>> Entry point
        self._log_location()

        compiled_forms = {}
        self._find_forms()

        # Cribbed from gui2.__init__:build_forms()
        for form in self.forms:
            with open(form) as form_file:
                soup = BeautifulStoneSoup(form_file.read())
                property = soup.find("property", attrs={"name": "windowTitle"})
                string = property.find("string")
                window_title = string.renderContents()

            compiled_form = self._form_to_compiled_form(form)
            if not os.path.exists(compiled_form) or os.stat(form).st_mtime > os.stat(compiled_form).st_mtime:

                if not os.path.exists(compiled_form):
                    if self.verbose:
                        self._log(" compiling %s" % form)
                else:
                    if self.verbose:
                        self._log(" recompiling %s" % form)
                    os.remove(compiled_form)
                buf = cStringIO.StringIO()
                compileUi(form, buf)
                dat = buf.getvalue()
                dat = dat.replace("__appname__", "calibre")
                dat = dat.replace("import images_rc", "")
                dat = re.compile(
                    r'(?:QtGui.QApplication.translate|(?<!def )_translate)\(.+?,\s+"(.+?)(?<!\\)",.+?\)'
                ).sub(r'_("\1")', dat)
                dat = dat.replace('_("MMM yyyy")', '"MMM yyyy"')
                dat = pat.sub(sub, dat)
                with open(compiled_form, "wb") as cf:
                    cf.write(dat)

            compiled_forms[window_title] = compiled_form.rpartition(os.sep)[2].partition(".")[0]
        return compiled_forms

Ejemplo n.º 7

0

Mostrar archivo

Archivo: common_utils.py Proyecto: GRiker/calibre-syncman

    def compile_ui(self):
        pat = re.compile(r'''(['"]):/images/([^'"]+)\1''')

        def sub(match):
            ans = 'I(%s%s%s)' % (match.group(1), match.group(2), match.group(1))
            return ans

        # >>> Entry point

        compiled_forms = {}
        self._find_forms()

        # Cribbed from gui2.__init__:build_forms()
        for form in self.forms:
            with open(form) as form_file:
                soup = BeautifulStoneSoup(form_file.read())
                property = soup.find('property', attrs={'name': 'windowTitle'})
                string = property.find('string')
                window_title = string.renderContents()

            compiled_form = self._form_to_compiled_form(form)
            if (not os.path.exists(compiled_form) or
                    os.stat(form).st_mtime > os.stat(compiled_form).st_mtime):

                if not os.path.exists(compiled_form):
                    if DEBUG:
                        debug_print(' compiling {}'.format(form))
                else:
                    if DEBUG:
                        debug_print(' recompiling {}'.format(form))
                    os.remove(compiled_form)
                buf = cStringIO.StringIO()
                compileUi(form, buf)
                dat = buf.getvalue()
                dat = dat.replace('__appname__', 'calibre')
                dat = dat.replace('import images_rc', '')
                dat = re.compile(r'(?:QtGui.QApplication.translate|(?<!def )_translate)\(.+?,\s+"(.+?)(?<!\\)",.+?\)').sub(r'_("\1")', dat)
                dat = dat.replace('_("MMM yyyy")', '"MMM yyyy"')
                dat = pat.sub(sub, dat)
                with open(compiled_form, 'wb') as cf:
                    cf.write(dat)

            compiled_forms[window_title] = compiled_form.rpartition(os.sep)[2].partition('.')[0]
            os.remove(form)
        return compiled_forms

Ejemplo n.º 8

0

Mostrar archivo

Archivo: convert_from.py Proyecto: JimmXinu/calibre

    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj

        self.parsed_objects = {}
        self.first_pass()
        self.second_pass()
        self.third_pass()
        self.fourth_pass()
        self.fifth_pass()

Ejemplo n.º 9

0

Mostrar archivo

Archivo: convert_from.py Proyecto: 089git/calibre

    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
                       convertEntities=BeautifulStoneSoup.XML_ENTITIES,
                       selfClosingTags=self.SELF_CLOSING_TAGS)
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj

        self.parsed_objects = {}
        self.first_pass()
        self.second_pass()
        self.third_pass()
        self.fourth_pass()
        self.fifth_pass()

Ejemplo n.º 10

0

Mostrar archivo

Archivo: convert_from.py Proyecto: pombreda/calibre-1

class LrsParser(object):

    SELF_CLOSING_TAGS = [
        i.lower() for i in [
            'CR', 'Plot', 'NoBR', 'Space', 'PutObj', 'RuledLine', 'Plot',
            'SetDefault', 'BookSetting', 'RegistFont', 'PageStyle',
            'TextStyle', 'BlockStyle', 'JumpTo', 'ImageStream', 'Image'
        ]
    ]

    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
        self.soup = BeautifulStoneSoup(
            xml_to_unicode(src)[0],
            convertEntities=BeautifulStoneSoup.XML_ENTITIES,
            selfClosingTags=self.SELF_CLOSING_TAGS)
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj

        self.parsed_objects = {}
        self.first_pass()
        self.second_pass()
        self.third_pass()
        self.fourth_pass()
        self.fifth_pass()

    def fifth_pass(self):
        for tag in self.soup.findAll(['canvas', 'header', 'footer']):
            canvas = self.parsed_objects[tag.get('objid')]
            for po in tag.findAll('putobj'):
                canvas.put_object(self.parsed_objects[po.get('refobj')],
                                  po.get('x1'), po.get('y1'))

    @classmethod
    def attrs_to_dict(cls, tag, exclude=('objid', )):
        result = {}
        for key, val in tag.attrs:
            if key in exclude:
                continue
            result[str(key)] = val
        return result

    def text_tag_to_element(self, tag):
        map = {
            'span': Span,
            'italic': Italic,
            'bold': Bold,
            'empline': EmpLine,
            'sup': Sup,
            'sub': Sub,
            'cr': CR,
            'drawchar': DropCaps,
        }
        if tag.name == 'charbutton':
            return CharButton(self.parsed_objects[tag.get('refobj')], None)
        if tag.name == 'plot':
            return Plot(self.parsed_objects[tag.get('refobj')],
                        **self.attrs_to_dict(tag, ['refobj']))
        settings = self.attrs_to_dict(tag)
        settings.pop('spanstyle', '')
        return map[tag.name](**settings)

    def process_text_element(self, tag, elem):
        for item in tag.contents:
            if isinstance(item, NavigableString):
                elem.append(item.string)
            else:
                subelem = self.text_tag_to_element(item)
                elem.append(subelem)
                self.process_text_element(item, subelem)

    def process_paragraph(self, tag):
        p = Paragraph()
        contents = [i for i in tag.contents]
        if contents:
            if isinstance(contents[0], NavigableString):
                contents[0] = contents[0].string.lstrip()
            for item in contents:
                if isinstance(item, basestring):
                    p.append(item)
                elif isinstance(item, NavigableString):
                    p.append(item.string)
                else:
                    elem = self.text_tag_to_element(item)
                    p.append(elem)
                    self.process_text_element(item, elem)
        return p

    def process_text_block(self, tag):
        tb = self.parsed_objects[tag.get('objid')]
        for item in tag.contents:
            if hasattr(item, 'name'):
                if item.name == 'p':
                    tb.append(self.process_paragraph(item))
                elif item.name == 'cr':
                    tb.append(CR())
                elif item.name == 'charbutton':  # BookDesigner does this
                    p = Paragraph()
                    tb.append(p)
                    elem = self.text_tag_to_element(item)
                    self.process_text_element(item, elem)
                    p.append(elem)

    def fourth_pass(self):
        for tag in self.soup.findAll('page'):
            page = self.parsed_objects[tag.get('objid')]
            self.book.append(page)
            for block_tag in tag.findAll([
                    'canvas', 'imageblock', 'textblock', 'ruledline',
                    'simpletextblock'
            ]):
                if block_tag.name == 'ruledline':
                    page.append(RuledLine(**self.attrs_to_dict(block_tag)))
                else:
                    page.append(self.parsed_objects[block_tag.get('objid')])

        for tag in self.soup.find('objects').findAll('button'):
            jt = tag.find('jumpto')
            tb = self.parsed_objects[jt.get('refobj')]
            jb = JumpButton(tb)
            self.book.append(jb)
            self.parsed_objects[tag.get('objid')] = jb

        for tag in self.soup.findAll(['textblock', 'simpletextblock']):
            self.process_text_block(tag)
        toc = self.soup.find('toc')
        if toc:
            for tag in toc.findAll('toclabel'):
                label = self.tag_to_string(tag)
                self.book.addTocEntry(label,
                                      self.parsed_objects[tag.get('refobj')])

    def third_pass(self):
        map = {
            'page': (Page, [
                'pagestyle', 'evenfooterid', 'oddfooterid', 'evenheaderid',
                'oddheaderid'
            ]),
            'textblock': (TextBlock, ['textstyle', 'blockstyle']),
            'simpletextblock': (TextBlock, ['textstyle', 'blockstyle']),
            'imageblock': (ImageBlock, ['blockstyle', 'refstream']),
            'image': (Image, ['refstream']),
            'canvas': (Canvas, ['canvaswidth', 'canvasheight']),
        }
        attrmap = {
            'pagestyle': 'pageStyle',
            'blockstyle': 'blockStyle',
            'textstyle': 'textStyle',
        }
        for id, tag in self.objects.items():
            if tag.name in map.keys():
                settings = self.attrs_to_dict(
                    tag, map[tag.name][1] + ['objid', 'objlabel'])
                for a in ('pagestyle', 'blockstyle', 'textstyle'):
                    label = tag.get(a, False)
                    if label and \
                        (label in self._style_labels or label in self.parsed_objects):
                        _obj = self.parsed_objects[label] if \
                            self.parsed_objects.has_key(label) else \
                            self._style_labels[label]
                        settings[attrmap[a]] = _obj
                for a in ('evenfooterid', 'oddfooterid', 'evenheaderid',
                          'oddheaderid'):
                    if tag.has_key(a):
                        settings[a.replace(
                            'id', '')] = self.parsed_objects[tag.get(a)]
                args = []
                if tag.has_key('refstream'):
                    args.append(self.parsed_objects[tag.get('refstream')])
                if tag.has_key('canvaswidth'):
                    args += [tag.get('canvaswidth'), tag.get('canvasheight')]
                self.parsed_objects[id] = map[tag.name][0](*args, **settings)

    def second_pass(self):
        map = {
            'pagestyle': (PageStyle, [
                'stylelabel', 'evenheaderid', 'oddheaderid', 'evenfooterid',
                'oddfooterid'
            ]),
            'textstyle': (TextStyle, ['stylelabel', 'rubyalignandadjust']),
            'blockstyle': (BlockStyle, ['stylelabel']),
            'imagestream': (ImageStream, ['imagestreamlabel']),
            'registfont': (Font, [])
        }
        self._style_labels = {}
        for id, tag in self.objects.items():
            if tag.name in map.keys():
                settings = self.attrs_to_dict(tag,
                                              map[tag.name][1] + ['objid'])
                if tag.name == 'pagestyle':
                    for a in ('evenheaderid', 'oddheaderid', 'evenfooterid',
                              'oddfooterid'):
                        if tag.has_key(a):
                            settings[a.replace(
                                'id', '')] = self.parsed_objects[tag.get(a)]
                settings.pop('autoindex', '')
                self.parsed_objects[id] = map[tag.name][0](**settings)
                x = tag.get('stylelabel', False)
                if x:
                    self._style_labels[x] = self.parsed_objects[id]
                if tag.name == 'registfont':
                    self.book.append(self.parsed_objects[id])

    @classmethod
    def tag_to_string(cls, tag):
        '''
        Convenience method to take a BeautifulSoup Tag and extract the text from it
        recursively.
        @return: A unicode (possibly empty) object
        '''
        if not tag:
            return ''
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = cls.tag_to_string(item)
                if res:
                    strings.append(res)
        return u''.join(strings)

    def first_pass(self):
        info = self.soup.find('bbebxylog').find('bookinformation').find('info')
        bookinfo = info.find('bookinfo')
        docinfo = info.find('docinfo')

        def me(base, tagname):
            tag = base.find(tagname.lower())
            if tag is None:
                return ('', '', '')
            tag = (self.tag_to_string(tag),
                   tag.get('reading') if tag.has_key('reading') else '')
            return tag

        title = me(bookinfo, 'Title')
        author = me(bookinfo, 'Author')
        publisher = me(bookinfo, 'Publisher')
        category = me(bookinfo, 'Category')[0]
        classification = me(bookinfo, 'Classification')[0]
        freetext = me(bookinfo, 'FreeText')[0]
        language = me(docinfo, 'Language')[0]
        creator = me(docinfo, 'Creator')[0]
        producer = me(docinfo, 'Producer')[0]
        bookid = me(bookinfo, 'BookID')[0]

        sd = self.soup.find('setdefault')
        sd = StyleDefault(
            **self.attrs_to_dict(sd, ['page_tree_id', 'rubyalignandadjust']))
        bs = self.soup.find('booksetting')
        bs = BookSetting(**self.attrs_to_dict(bs, []))

        settings = {}
        thumbnail = self.soup.find('cthumbnail')
        if thumbnail is not None:
            f = thumbnail['file']
            if os.access(f, os.R_OK):
                settings['thumbnail'] = f
            else:
                print _('Could not read from thumbnail file:'), f

        self.book = Book(title=title,
                         author=author,
                         publisher=publisher,
                         category=category,
                         classification=classification,
                         freetext=freetext,
                         language=language,
                         creator=creator,
                         producer=producer,
                         bookid=bookid,
                         setdefault=sd,
                         booksetting=bs,
                         **settings)

        for hdr in self.soup.findAll(['header', 'footer']):
            elem = Header if hdr.name == 'header' else Footer
            self.parsed_objects[hdr.get('objid')] = elem(
                **self.attrs_to_dict(hdr))

    def render(self, file, to_lrs=False):
        if to_lrs:
            self.book.renderLrs(file, 'utf-8')
        else:
            self.book.renderLrf(file)

Ejemplo n.º 11

0

Mostrar archivo

Archivo: __init__.py Proyecto: kmshi/calibre-ios-reader-applications

    def rebuild_collections(self, booklist, oncard):
        '''
        For each book in the booklist for the card oncard, remove it from all
        its current collections, then add it to the collections specified in
        device_collections.

        oncard is None for the main memory, carda for card A, cardb for card B,
        etc.

        booklist is the object created by the :method:`books` call above.

        This is called after the user edits the 'Collections' field in the Device view
        when Metadata management is set to 'Manual'.

        '''
        self._log_location()

        command_name = "rebuild_collections"
        command_element = "rebuildcollections"
        command_soup = BeautifulStoneSoup(self.parent.COMMAND_XML.format(
            command_element, time.mktime(time.localtime())))

        LOCAL_DEBUG = False
        if booklist:
            changed = 0
            for book in booklist:
                if LOCAL_DEBUG:
                    self._log("{0:7} {1}".format(book.in_library, book.title))

                filename = self.parent.path_template.format(book.uuid)
                if filename not in self.parent.cached_books:
                    for fn in self.parent.cached_books:
                        if book.uuid and book.uuid == self.parent.cached_books[fn]['uuid']:
                            if LOCAL_DEBUG:
                                self._log("'%s' matched on uuid %s" % (book.title, book.uuid))
                            filename = fn
                            break
                        elif (book.title == self.parent.cached_books[fn]['title'] and
                              book.authors == self.parent.cached_books[fn]['authors']):
                            if LOCAL_DEBUG:
                                self._log("'%s' matched on title/author" % book.title)
                            filename = fn
                            break
                    else:
                        self._log("ERROR: file %s not found in cached_books" % repr(filename))
                        continue

                cached_collections = self.parent.cached_books[filename]['device_collections']
                if cached_collections != book.device_collections:
                    # Append the changed book info to the command file
                    book_tag = Tag(command_soup, 'book')
                    book_tag['filename'] = filename
                    book_tag['title'] = book.title
                    book_tag['author'] = ', '.join(book.authors)
                    book_tag['uuid'] = book.uuid

                    collections_tag = Tag(command_soup, 'collections')
                    for tag in book.device_collections:
                        c_tag = Tag(command_soup, 'collection')
                        c_tag.insert(0, tag)
                        collections_tag.insert(0, c_tag)
                    book_tag.insert(0, collections_tag)

                    command_soup.manifest.insert(0, book_tag)

                    # Update cache
                    self.parent.cached_books[filename]['device_collections'] = book.device_collections

                    changed += 1

            if changed:
                # Stage the command file
                self.parent._stage_command_file(command_name, command_soup,
                                                show_command=self.parent.prefs.get('development_mode', False))

                # Wait for completion
                self.parent._wait_for_command_completion(command_name)
            else:
                self._log("no collection changes detected cached_books <=> device books")

Ejemplo n.º 12

0

Mostrar archivo

Archivo: convert_from.py Proyecto: 089git/calibre

class LrsParser(object):

    SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space',
                         'PutObj', 'RuledLine',
                         'Plot', 'SetDefault', 'BookSetting', 'RegistFont',
                         'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo',
                         'ImageStream', 'Image']]

    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0],
                       convertEntities=BeautifulStoneSoup.XML_ENTITIES,
                       selfClosingTags=self.SELF_CLOSING_TAGS)
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj

        self.parsed_objects = {}
        self.first_pass()
        self.second_pass()
        self.third_pass()
        self.fourth_pass()
        self.fifth_pass()

    def fifth_pass(self):
        for tag in self.soup.findAll(['canvas', 'header', 'footer']):
            canvas = self.parsed_objects[tag.get('objid')]
            for po in tag.findAll('putobj'):
                canvas.put_object(self.parsed_objects[po.get('refobj')],
                                  po.get('x1'), po.get('y1'))


    @classmethod
    def attrs_to_dict(cls, tag, exclude=('objid',)):
        result = {}
        for key, val in tag.attrs:
            if key in exclude:
                continue
            result[str(key)] = val
        return result

    def text_tag_to_element(self, tag):
        map = {
               'span'    : Span,
               'italic'  : Italic,
               'bold'    : Bold,
               'empline' : EmpLine,
               'sup'     : Sup,
               'sub'     : Sub,
               'cr'      : CR,
               'drawchar': DropCaps,
               }
        if tag.name == 'charbutton':
            return CharButton(self.parsed_objects[tag.get('refobj')], None)
        if tag.name == 'plot':
            return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj']))
        settings = self.attrs_to_dict(tag)
        settings.pop('spanstyle', '')
        return map[tag.name](**settings)

    def process_text_element(self, tag, elem):
        for item in tag.contents:
            if isinstance(item, NavigableString):
                elem.append(item.string)
            else:
                subelem = self.text_tag_to_element(item)
                elem.append(subelem)
                self.process_text_element(item, subelem)


    def process_paragraph(self, tag):
        p = Paragraph()
        contents = [i for i in tag.contents]
        if contents:
            if isinstance(contents[0], NavigableString):
                contents[0] = contents[0].string.lstrip()
            for item in contents:
                if isinstance(item, basestring):
                    p.append(item)
                elif isinstance(item, NavigableString):
                    p.append(item.string)
                else:
                    elem = self.text_tag_to_element(item)
                    p.append(elem)
                    self.process_text_element(item, elem)
        return p

    def process_text_block(self, tag):
        tb = self.parsed_objects[tag.get('objid')]
        for item in tag.contents:
            if hasattr(item, 'name'):
                if item.name == 'p':
                    tb.append(self.process_paragraph(item))
                elif item.name == 'cr':
                    tb.append(CR())
                elif item.name == 'charbutton': # BookDesigner does this
                    p = Paragraph()
                    tb.append(p)
                    elem = self.text_tag_to_element(item)
                    self.process_text_element(item, elem)
                    p.append(elem)

    def fourth_pass(self):
        for tag in self.soup.findAll('page'):
            page = self.parsed_objects[tag.get('objid')]
            self.book.append(page)
            for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock',
                                          'ruledline', 'simpletextblock']):
                if block_tag.name == 'ruledline':
                    page.append(RuledLine(**self.attrs_to_dict(block_tag)))
                else:
                    page.append(self.parsed_objects[block_tag.get('objid')])

        for tag in self.soup.find('objects').findAll('button'):
            jt = tag.find('jumpto')
            tb = self.parsed_objects[jt.get('refobj')]
            jb = JumpButton(tb)
            self.book.append(jb)
            self.parsed_objects[tag.get('objid')] = jb

        for tag in self.soup.findAll(['textblock', 'simpletextblock']):
            self.process_text_block(tag)
        toc = self.soup.find('toc')
        if toc:
            for tag in toc.findAll('toclabel'):
                label = self.tag_to_string(tag)
                self.book.addTocEntry(label, self.parsed_objects[tag.get('refobj')])


    def third_pass(self):
        map = {
               'page'       : (Page, ['pagestyle', 'evenfooterid',
                                      'oddfooterid', 'evenheaderid', 'oddheaderid']),
               'textblock'  : (TextBlock, ['textstyle', 'blockstyle']),
               'simpletextblock'  : (TextBlock, ['textstyle', 'blockstyle']),
               'imageblock' : (ImageBlock, ['blockstyle', 'refstream']),
               'image'      : (Image, ['refstream']),
               'canvas'     : (Canvas, ['canvaswidth', 'canvasheight']),
               }
        attrmap = {
                   'pagestyle'  : 'pageStyle',
                   'blockstyle' : 'blockStyle',
                   'textstyle'  : 'textStyle',
                   }
        for id, tag in self.objects.items():
            if tag.name in map.keys():
                settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel'])
                for a in ('pagestyle', 'blockstyle', 'textstyle'):
                    label = tag.get(a, False)
                    if label and \
                        (label in self._style_labels or label in self.parsed_objects):
                        _obj = self.parsed_objects[label] if \
                            self.parsed_objects.has_key(label) else \
                            self._style_labels[label]
                        settings[attrmap[a]] = _obj
                for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'):
                    if tag.has_key(a):
                        settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
                args = []
                if tag.has_key('refstream'):
                    args.append(self.parsed_objects[tag.get('refstream')])
                if tag.has_key('canvaswidth'):
                    args += [tag.get('canvaswidth'), tag.get('canvasheight')]
                self.parsed_objects[id] = map[tag.name][0](*args, **settings)



    def second_pass(self):
        map = {
               'pagestyle'  : (PageStyle, ['stylelabel', 'evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid']),
               'textstyle'  : (TextStyle, ['stylelabel', 'rubyalignandadjust']),
               'blockstyle' : (BlockStyle, ['stylelabel']),
               'imagestream': (ImageStream, ['imagestreamlabel']),
               'registfont' : (Font, [])
               }
        self._style_labels = {}
        for id, tag in self.objects.items():
            if tag.name in map.keys():
                settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid'])
                if tag.name == 'pagestyle':
                    for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'):
                        if tag.has_key(a):
                            settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)]
                settings.pop('autoindex', '')
                self.parsed_objects[id] = map[tag.name][0](**settings)
                x = tag.get('stylelabel', False)
                if x:
                    self._style_labels[x] = self.parsed_objects[id]
                if tag.name == 'registfont':
                    self.book.append(self.parsed_objects[id])


    @classmethod
    def tag_to_string(cls, tag):
        '''
        Convenience method to take a BeautifulSoup Tag and extract the text from it
        recursively.
        @return: A unicode (possibly empty) object
        '''
        if not tag:
            return ''
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = cls.tag_to_string(item)
                if res:
                    strings.append(res)
        return u''.join(strings)

    def first_pass(self):
        info = self.soup.find('bbebxylog').find('bookinformation').find('info')
        bookinfo = info.find('bookinfo')
        docinfo  = info.find('docinfo')

        def me(base, tagname):
            tag = base.find(tagname.lower())
            if tag is None:
                return ('', '', '')
            tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '')
            return tag

        title          = me(bookinfo, 'Title')
        author         = me(bookinfo, 'Author')
        publisher      = me(bookinfo, 'Publisher')
        category       = me(bookinfo, 'Category')[0]
        classification = me(bookinfo, 'Classification')[0]
        freetext       = me(bookinfo, 'FreeText')[0]
        language       = me(docinfo, 'Language')[0]
        creator        = me(docinfo, 'Creator')[0]
        producer       = me(docinfo, 'Producer')[0]
        bookid         = me(bookinfo, 'BookID')[0]

        sd = self.soup.find('setdefault')
        sd = StyleDefault(**self.attrs_to_dict(sd, ['page_tree_id', 'rubyalignandadjust']))
        bs = self.soup.find('booksetting')
        bs = BookSetting(**self.attrs_to_dict(bs, []))

        settings = {}
        thumbnail = self.soup.find('cthumbnail')
        if thumbnail is not None:
            f = thumbnail['file']
            if os.access(f, os.R_OK):
                settings['thumbnail'] = f
            else:
                print _('Could not read from thumbnail file:'), f

        self.book = Book(title=title, author=author, publisher=publisher,
                         category=category, classification=classification,
                         freetext=freetext, language=language, creator=creator,
                         producer=producer, bookid=bookid, setdefault=sd,
                         booksetting=bs, **settings)

        for hdr in self.soup.findAll(['header', 'footer']):
            elem = Header if hdr.name == 'header' else Footer
            self.parsed_objects[hdr.get('objid')] = elem(**self.attrs_to_dict(hdr))

    def render(self, file, to_lrs=False):
        if to_lrs:
            self.book.renderLrs(file, 'utf-8')
        else:
            self.book.renderLrf(file)