def fget(self):
    """Return the editor page's contents as sanitized HTML.

    Serializes the Qt page, strips encoding declarations and HTML
    comments, drops <script>/<style> elements, and returns the
    remaining body markup wrapped in a <div>/<p> as needed.  Returns
    an empty string when the document has no text and no images.
    """
    ans = u''
    try:
        check = unicode(self.page().mainFrame().toPlainText()).strip()
        raw = unicode(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        # Nothing worth keeping: no visible text and no images
        if not check and '<img' not in raw.lower():
            return ans
        try:
            root = html.fromstring(raw)
        except Exception:  # narrowed from bare except: do not swallow KeyboardInterrupt
            # Fall back to the more forgiving soup-based parser
            root = fromstring(raw)
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding=unicode) for x in body
                      if x.tag not in ('script', 'style')]
        if len(elems) > 1:
            ans = u'<div>%s</div>'%(u''.join(elems))
        else:
            ans = u''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>'%ans
        ans = xml_replace_entities(ans)
    except Exception:  # narrowed from bare except
        # Best effort: report the failure and return whatever was built
        import traceback
        traceback.print_exc()
    return ans
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    """Scrape an OverDrive metadata page and fill in ``mi``.

    Populates pubdate, language, isbn, tags and comments on the
    Metadata object ``mi`` from the page at ``metadata_url`` fetched
    with browser ``br``.  Returns False when the page is missing (404)
    or cannot be parsed; returns None on success.
    """
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.soupparser import fromstring
    from calibre.library.comments import sanitize_comments_html
    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        # A 404 just means no detail page exists for this id
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    try:
        root = fromstring(raw)
    except Exception:  # narrowed from bare except
        return False
    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except Exception:  # narrowed from bare except: unparsable date is ignored
            pass
    if lang:
        # Map the site's language names to ISO 639-2 codes
        lang = lang[0].strip().lower()
        lang = {'english':'eng', 'french':'fra', 'german':'deu',
                'spanish':'spa'}.get(lang, None)
        if lang:
            mi.language = lang
    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]
    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding=unicode).strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)
    return None
def _html4_parse(data, prefer_soup=False):
    """Parse HTML 4 markup, clean it, and return it as an XML tree.

    The markup is first parsed leniently (lxml.html, or BeautifulSoup
    when ``prefer_soup`` is True), comment text is stripped of dashes
    so it survives XML serialization, and the result is re-parsed as
    strict XML, falling back to a recovering parser on syntax errors.
    """
    if prefer_soup:
        from calibre.utils.soupparser import fromstring
        root = fromstring(data)
    else:
        root = html.fromstring(data)
    root.attrib.pop('xmlns', None)
    # Dashes inside comments are illegal in XML; strip them
    for comment in root.iter(tag=etree.Comment):
        if comment.text:
            comment.text = comment.text.strip('-')
    serialized = etree.tostring(root, encoding=unicode)
    # Setting huge_tree=True causes crashes in windows with large files
    strict_parser = etree.XMLParser(no_network=True)
    try:
        return etree.fromstring(serialized, parser=strict_parser)
    except etree.XMLSyntaxError:
        return etree.fromstring(serialized, parser=RECOVER_PARSER)
def fget(self):
    """Return the editor page's contents as sanitized HTML.

    When the page carries a ``calibre-dont-sanitize`` meta tag the raw
    HTML is returned untouched.  Otherwise encoding declarations,
    comments and <script>/<style> elements are stripped and the body
    markup is returned, wrapped in a <div>/<p> as needed.  Returns an
    empty string when the document has no text and no images.
    """
    ans = u''
    try:
        if not self.page().mainFrame().documentElement().findFirst(
                'meta[name="calibre-dont-sanitize"]').isNull():
            # Bypass cleanup if special meta tag exists
            return unicode_type(self.page().mainFrame().toHtml())
        check = unicode_type(
            self.page().mainFrame().toPlainText()).strip()
        raw = unicode_type(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        # Nothing worth keeping: no visible text and no images
        if not check and '<img' not in raw.lower():
            return ans
        try:
            root = html.fromstring(raw)
        except Exception:  # narrowed from bare except: do not swallow KeyboardInterrupt
            # Fall back to the more forgiving soup-based parser
            root = fromstring(raw)
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [
                html.tostring(x, encoding=unicode_type)
                for x in body if x.tag not in ('script', 'style')
            ]
        if len(elems) > 1:
            ans = u'<div>%s</div>' % (u''.join(elems))
        else:
            ans = u''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>' % ans
        ans = xml_replace_entities(ans)
    except Exception:  # narrowed from bare except
        # Best effort: report the failure and return whatever was built
        import traceback
        traceback.print_exc()
    return ans
def extract_content(self, output_dir, parse_cache):
    """Convert the decoded MOBI markup into an HTML file tree.

    Cleans up the raw MOBI HTML (broken tags, encoding declarations,
    entities), parses it (falling back to BeautifulSoup for malformed
    markup), normalizes the document to a single <html>/<head>/<body>
    structure, writes index.html/toc.ncx/styles.css/the OPF into
    output_dir, and records the parsed tree in ``parse_cache``.
    """
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    # Repair common MOBI tag breakage like '</</p>' and '</p<p>'
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
            self.processed_html)
    self.processed_html = self.processed_html.replace(u'\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
            self.processed_html)
    for pat in ENCODING_PATS:
        self.processed_html = pat.sub('', self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
        self.processed_html)
    self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()
    self.log.debug('Parsing HTML...')
    self.processed_html = clean_ascii_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c',
                '').replace('\x14', ''))
    except Exception:  # narrowed from bare except
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from calibre.utils.soupparser import fromstring
        self.log.warning('Malformed markup, parsing using BeautifulSoup')
        try:
            root = fromstring(self.processed_html)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = fromstring(self.processed_html)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = fromstring(self.processed_html)
    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot
    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)
    for x in root.xpath('//script'):
        x.getparent().remove(x)
    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css',
        'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type',
        'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'
    self.upshift_markup(root)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        # guide is None when the markup contained no <guide>
        pass

    def write_as_utf8(path, data):
        # Helper: write text (or bytes) to path as UTF-8
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = cStringIO.StringIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    # Use a context manager so the OPF file handle is always closed
    with lopen(self.created_opf_path, 'wb') as opf_file:
        opf.render(opf_file, ncx, ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = cStringIO.StringIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        # Close the file handle instead of leaking it
        with open(os.path.splitext(htmlfile)[0] + '.opf', 'wb') as opf_file:
            opf.render(opf_file, ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
def prepare_xhtml_pt1(self, f):
    """Normalize an (X)HTML content file to UTF-8 and re-serialize it.

    When FIX_CHARACTER_ENCODING is set: strips a UTF-8 BOM, detects the
    declared encoding, transcodes to UTF-8, rewrites the encoding
    declaration, and ensures an XML declaration on XHTML files.  Then
    parses the file (falling back to HTML soup for files without an XML
    declaration), records the body id, adds dictionary namespaces when
    needed, and stores the serialized tree back on ``f.data``.
    """
    if FIX_CHARACTER_ENCODING:
        new_data = f.data
        if new_data.startswith(b"\xef\xbb\xbf"):
            new_data = new_data[3:]
            log.info("Removed UTF-8 BOM from %s" % f.name)
        # Only the first 1K is scanned for an encoding declaration
        header = new_data[:1024].decode("utf-8", "ignore")
        for pat in ENCODING_PATS:
            m = re.search(pat, header, re.IGNORECASE)
            if m:
                enc = m.group(2).lower()
                if enc == "utf8":
                    enc = "utf-8"
                break
        else:
            log.info("Assuming UTF-8 encoding for %s" % f.name)
            enc = "utf-8"
        try:
            new_text = new_data.decode("utf-8")
        except UnicodeDecodeError:
            if enc == "utf-8":
                log.warning("Content failed to decode as UTF-8 in %s" % f.name)
            else:
                log.info("Changed encoding from %s to UTF-8 in %s" % (enc.upper(), f.name))
            new_text = new_data.decode(enc, errors="replace")
        if enc != "utf-8":
            for pat in ENCODING_PATS:
                # BUG FIX: re.IGNORECASE was passed positionally as the
                # count argument; it must be the flags keyword
                new_text, i = re.subn(pat, r"\1utf-8\3", new_text,
                                      flags=re.IGNORECASE)
                if i:
                    log.info(
                        "Changed encoding declaration from %s to UTF-8 in %s" %
                        (enc.upper(), f.name))
        if (f.ext == "xhtml" or f.mimetype == "application/xhtml+xml"
                ) and not new_text.strip().startswith("<?xml"):
            new_text = "<?xml version='1.0' encoding='utf-8'?>" + new_text
            log.info("Added XML declaration to %s" % f.name)
        # Put a newline between the XML declaration and the root tag
        new_text = re.sub(r"<\?xml([^\?]*?)\?><", r"<?xml\1?>\n<", new_text)
        f.data = new_text.encode("utf-8")
    try:
        document = self.parse_xhtml_file(f)
    except Exception:
        return
    body = tfind(document, "body")
    if body is not None:
        f.body_id = body.get("id")
    if self.is_dictionary:
        html = tfind(document, "html")
        for ns, url in DICTIONARY_NSMAP.items():
            # Add each dictionary namespace actually used in the markup
            if ns not in html.nsmap and ("<%s:" % ns).encode("utf8") in f.data:
                f.data = f.data.replace(b"<html", ("<html xmlns:%s=\"%s\"" %
                        (ns, url)).encode("utf8"), 1)
                log.info("Added %s XML namespace to %s" % (ns, f.name))
    if not f.data.startswith(b"<?xml"):
        log.info("Parsing %s as HTML soup" % f.name)
        document = soupparser.fromstring(f.data)
    f.data = etree.tostring(document, encoding="utf-8",
            pretty_print=False, xml_declaration=False)
def extract_content(self, output_dir, parse_cache):
    """Convert the decoded MOBI markup into an HTML file tree.

    Cleans up the raw MOBI HTML (broken tags, encoding declarations,
    entities), parses it (falling back to BeautifulSoup for malformed
    markup), normalizes the document to a single <html>/<head>/<body>
    structure, writes index.html/toc.ncx/styles.css/the OPF into
    output_dir, and records the parsed tree in ``parse_cache``.
    """
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    # Repair common MOBI tag breakage like '</</p>' and '</p<p>'
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
            self.processed_html)
    self.processed_html = self.processed_html.replace(u'\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
            self.processed_html)
    for pat in ENCODING_PATS:
        self.processed_html = pat.sub('', self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
        self.processed_html)
    self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()
    self.log.debug('Parsing HTML...')
    self.processed_html = clean_ascii_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c',
                '').replace('\x14', ''))
    except Exception:  # narrowed from bare except
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from calibre.utils.soupparser import fromstring
        self.log.warning('Malformed markup, parsing using BeautifulSoup')
        try:
            root = fromstring(self.processed_html)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = fromstring(self.processed_html)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = fromstring(self.processed_html)
    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot
    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)
    for x in root.xpath('//script'):
        x.getparent().remove(x)
    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css',
        'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type',
        'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'
    self.upshift_markup(root)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        # guide is None when the markup contained no <guide>
        pass
    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = cStringIO.StringIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    # Use context managers so file handles are always closed
    with open(self.created_opf_path, 'wb') as opf_file:
        opf.render(opf_file, ncx, ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        with open(ncx_path, 'wb') as ncx_file:
            ncx_file.write(ncx)
    with open('styles.css', 'wb') as s:
        s.write(self.base_css_rules + '\n\n')
        for cls, rule in self.tag_css_rules.items():
            if isinstance(rule, unicode):
                rule = rule.encode('utf-8')
            s.write('.%s { %s }\n\n' % (cls, rule))
    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = cStringIO.StringIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        with open(os.path.splitext(htmlfile)[0] + '.opf', 'wb') as opf_file:
            opf.render(opf_file, ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            with open(os.path.splitext(htmlfile)[0] + '.ncx', 'wb') as ncx_file:
                ncx_file.write(ncx)
from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.soupparser import fromstring from calibre.library.comments import sanitize_comments_html try: raw = br.open_novisit(metadata_url).read() except Exception, e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False raise raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: root = fromstring(raw) except: return False pub_date = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") lang = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") subjects = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") ebook_isbn = root.xpath( "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") desc = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]" )
from lxml import html from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.soupparser import fromstring from calibre.library.comments import sanitize_comments_html try: raw = br.open_novisit(metadata_url).read() except Exception, e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False raise raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: root = fromstring(raw) except: return False pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]") if pub_date: from calibre.utils.date import parse_date try: mi.pubdate = parse_date(pub_date[0].strip()) except: pass
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    """Scrape an OverDrive metadata page and fill in ``mi``.

    Populates pubdate, language, isbn, tags and comments on the
    Metadata object ``mi`` from the page at ``metadata_url`` fetched
    with browser ``br``.  Returns False when the page is missing (404)
    or cannot be parsed; returns None on success.
    """
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.soupparser import fromstring
    from calibre.library.comments import sanitize_comments_html
    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        # A 404 just means no detail page exists for this id
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    try:
        root = fromstring(raw)
    except Exception:  # narrowed from bare except
        return False
    pub_date = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath(
        "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]"
    )
    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except Exception:  # narrowed from bare except: unparsable date is ignored
            pass
    if lang:
        # Map the site's language names to ISO 639-2 codes
        lang = lang[0].strip().lower()
        lang = {
            'english': 'eng',
            'french': 'fra',
            'german': 'deu',
            'spanish': 'spa'
        }.get(lang, None)
        if lang:
            mi.language = lang
    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]
    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding='unicode').strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)
    return None