Example #1
1
    def do_one_isbn_add(self):
        """
        Import one queued ISBN entry and schedule the next call.

        Each call removes the first entry of ``self.isbn_books``, imports a
        record carrying its ISBN (plus ``self.isbn_add_tags`` and the entry's
        file path, when present), bumps the progress dialog value and re-arms
        itself through ``QTimer.singleShot``.

        Once the queue is empty, the library view model is told how many
        books were added, the dialog is accepted and a metadata download is
        started for every imported book id.

        Any exception accepts (closes) the dialog before propagating.
        """
        try:
            db = self.gui.library_view.model().db

            try:
                entry = self.isbn_books.pop(0)
            except IndexError:
                # Nothing left to import: refresh the view, close the dialog
                # and fetch metadata for everything added so far.
                model = self.gui.library_view.model()
                model.books_added(self.isbn_add_dialog.value)
                self.isbn_add_dialog.accept()
                edit_action = self.gui.iactions['Edit Metadata']
                edit_action.download_metadata(
                    ids=self.add_by_isbn_ids,
                    ensure_fields=frozenset(['title', 'authors']))
                return

            book = MetaInformation(None)
            book.isbn = entry['isbn']
            if self.isbn_add_tags:
                book.tags = list(self.isbn_add_tags)
            if entry['path'] is None:
                formats = []
            else:
                formats = [entry['path']]
            new_id = db.import_book(book, formats)
            self.add_by_isbn_ids.add(new_id)
            self.isbn_add_dialog.value += 1
            # Handle the next entry from the event loop instead of looping here.
            QTimer.singleShot(10, self.do_one_isbn_add)
        except:
            # Deliberately catches everything: the dialog must be closed on
            # any failure, and the exception is re-raised untouched.
            self.isbn_add_dialog.accept()
            raise
Example #2
0
 def setUp(self):
     """
     Create a temporary library and add three test books to it.

     The library lives in a fresh persistent temporary directory. One small
     ``test.txt`` file is used as the ``txt`` format of all three books. The
     books use the metadata ``m1``, ``m2`` and ``m2`` again, added with
     ``add_duplicates=True``. ``m1`` and ``m2`` are kept on the test case.
     """
     self.tdir = PersistentTemporaryDirectory('_calibre_dbtest')
     self.db = LibraryDatabase2(self.tdir)

     m1 = MetaInformation('Test Ebook 1', ['Test Author 1'])
     m1.tags = ['tag1', 'tag2']
     m1.publisher = 'Test Publisher 1'
     m1.rating = 2
     m1.series = 'Test Series 1'
     m1.series_index = 3
     m1.author_sort = 'as1'
     m1.isbn = 'isbn1'
     m1.cover_data = ('jpg', self.img)

     m2 = MetaInformation('Test Ebook 2', ['Test Author 2'])
     m2.tags = ['tag3', 'tag4']
     m2.publisher = 'Test Publisher 2'
     m2.rating = 3
     m2.series = 'Test Series 2'
     m2.series_index = 1
     m2.author_sort = 'as1'
     m2.isbn = 'isbn1'

     # The open file object itself is handed to add_books three times. Keep
     # it open only for that call so the handle is not leaked by every test.
     with open(os.path.join(self.tdir, 'test.txt'), 'w+b') as f:
         # The file is opened in binary mode, so write a bytes literal.
         f.write(b'test')
         paths = list(repeat(f, 3))
         formats = list(repeat('txt', 3))
         self.db.add_books(paths, formats, [m1, m2, m2], add_duplicates=True)

     self.m1, self.m2 = m1, m2
Example #3
0
 def setUp(self):
     """
     Build the fixture library used by the database tests.

     A new library is created in a persistent temporary directory and three
     books are added to it with ``add_duplicates=True``. All three share one
     ``test.txt`` file as their ``txt`` format; the first uses ``m1`` and the
     other two use ``m2``. Both metadata objects are stored on ``self``.
     """
     self.tdir = PersistentTemporaryDirectory('_calibre_dbtest')
     self.db = LibraryDatabase2(self.tdir)

     m1 = MetaInformation('Test Ebook 1', ['Test Author 1'])
     m1.tags = ['tag1', 'tag2']
     m1.publisher = 'Test Publisher 1'
     m1.rating = 2
     m1.series = 'Test Series 1'
     m1.series_index = 3
     m1.author_sort = 'as1'
     m1.isbn = 'isbn1'
     m1.cover_data = ('jpg', self.img)

     m2 = MetaInformation('Test Ebook 2', ['Test Author 2'])
     m2.tags = ['tag3', 'tag4']
     m2.publisher = 'Test Publisher 2'
     m2.rating = 3
     m2.series = 'Test Series 2'
     m2.series_index = 1
     m2.author_sort = 'as1'
     m2.isbn = 'isbn1'

     book_path = os.path.join(self.tdir, 'test.txt')
     # add_books receives the open file object (not its path); the ``with``
     # block guarantees the handle is closed once the books are stored.
     with open(book_path, 'w+b') as f:
         f.write(b'test')  # bytes literal: the file is opened in binary mode
         paths = list(repeat(f, 3))
         formats = list(repeat('txt', 3))
         self.db.add_books(paths, formats, [m1, m2, m2], add_duplicates=True)

     self.m1, self.m2 = m1, m2
Example #4
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    Embedded metadata is only read when record 0 is 132 bytes long, the
    header's compression value is 2 or 10 and the header flags metadata as
    present. The metadata record is decoded as cp1252 and split on NUL
    bytes; fields 0, 1, 3 and 4 become title, author, publisher and ISBN.

    When no title was obtained, the title from the PDB header is used, or
    the translated 'Unknown' if that is empty too.
    """
    def scrub(text):
        # Remove every character that is not in the allowed set.
        return re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', text)

    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if len(pheader.section_data(0)) == 132:
        hr = HeaderRecord(pheader.section_data(0))
        metadata_present = hr.compression in (2, 10) and hr.has_metadata == 1

        if metadata_present:
            try:
                raw = pheader.section_data(hr.metadata_offset)
                fields = raw.decode('cp1252', 'replace').split('\x00')
                mi.title = scrub(fields[0])
                mi.authors = [scrub(fields[1])]
                mi.publisher = scrub(fields[3])
                mi.isbn = scrub(fields[4])
            except Exception:
                # Best effort: a malformed metadata record leaves whatever
                # fields were already set and falls through to the defaults.
                pass

            if extract_cover:
                mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')

    return mi
Example #5
0
File: odt.py Project: sss/calibre
def get_metadata(stream, extract_cover=True):
    """
    Read metadata from an ODF container and return a MetaInformation object.

    ``meta.xml`` is read from the zip archive in ``stream`` and parsed with a
    namespace-aware SAX parser; the fields collected by the content handler
    populate title, authors, comments, language and tags.

    When the ``opf.metadata`` field equals ``'true'``, the ``opf.*`` fields
    additionally set title sort, authors, author sort, ISBN, publisher,
    publication date, series, series index and languages. ``opf.nocover`` set
    to ``'true'`` suppresses cover reading.

    :param stream: file-like object containing the ODF zip archive
    :param extract_cover: forwarded to ``read_cover``
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    # ``in`` instead of dict.has_key(), which no longer exists in Python 3.
    if 'title' in data:
        mi.title = data['title']
    # Prefer the initial creator; fall back to the (last) creator.
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except Exception:
            # Do not let an error reading the cover prevent reading other
            # data. KeyboardInterrupt/SystemExit are no longer swallowed.
            pass

    return mi
Example #6
0
def get_metadata_from_reader(rdr):
    """
    Build a MetaInformation object from an opened reader.

    The reader's home file is parsed with BeautifulSoup and mined for
    authors, publisher, ISBN, comments and cover. The title comes from
    ``rdr.title`` and is converted to unicode using the encoding reported by
    the reader, or cp1252 when that encoding is missing or unknown.

    :param rdr: reader exposing ``GetFile``, ``home``, ``title`` and
                ``GetEncoding``
    :return: the populated MetaInformation object
    """
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(
        xml_to_unicode(raw, strip_encoding_pats=True,
                       resolve_entities=True)[0])

    title = rdr.title
    try:
        x = rdr.GetEncoding()
        # Raises for an unknown codec name or a non-string value.
        codecs.lookup(x)
        enc = x
    except Exception:
        # Any failure to obtain a usable encoding falls back to cp1252;
        # KeyboardInterrupt/SystemExit still propagate.
        enc = 'cp1252'
    title = force_unicode(title, enc)
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi
Example #7
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    Embedded metadata is only read when record 0 is 132 bytes long, the
    header's compression value is 2 or 10 and the header flags metadata as
    present. The metadata record is decoded as cp1252 and split on NUL
    bytes; fields 0, 1, 3 and 4 become title, author, publisher and ISBN,
    each reduced to a whitelisted character set.

    When no title was obtained, the title from the PDB header is used, or
    the translated 'Unknown' if that is empty too.
    """
    def clean(text):
        # Keep only the whitelisted characters.
        return re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', text)

    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if len(pheader.section_data(0)) == 132:
        hr = HeaderRecord(pheader.section_data(0))

        if hr.compression in (2, 10) and hr.has_metadata == 1:
            try:
                mdata = pheader.section_data(hr.metadata_offset)

                # Decode before splitting: splitting raw bytes with a text
                # separator fails on Python 3 and the error would be silently
                # swallowed below, losing all embedded metadata.
                mdata = mdata.decode('cp1252', 'replace').split('\x00')
                mi.title = clean(mdata[0])
                mi.authors = [clean(mdata[1])]
                mi.publisher = clean(mdata[3])
                mi.isbn = clean(mdata[4])
            except Exception:
                # Best effort: a malformed record must not abort reading, but
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                pass

            if extract_cover:
                mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')

    return mi
Example #8
0
def get_metadata(stream, extract_cover=True):
    """
    Read metadata from an ODF container and return a MetaInformation object.

    ``meta.xml`` is read from the zip archive in ``stream`` and parsed with a
    namespace-aware SAX parser; the fields collected by the content handler
    populate title, authors, comments, language and tags.

    When the ``opf.metadata`` field equals ``'true'``, the ``opf.*`` fields
    additionally set title sort, authors, author sort, ISBN, publisher,
    publication date, series, series index and languages. ``opf.nocover`` set
    to ``'true'`` suppresses cover reading.

    :param stream: file-like object containing the ODF zip archive
    :param extract_cover: forwarded to ``read_cover``
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    # Prefer the initial creator; fall back to the (last) creator.
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    # Unparseable index: use the first position in the series.
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except Exception:
            # Do not let an error reading the cover prevent reading other
            # data. KeyboardInterrupt/SystemExit are no longer swallowed.
            pass

    return mi
Example #9
0
    def do_one_isbn_add(self):
        """
        Process a single pending ISBN and queue the next step.

        The first element of ``self.isbn_books`` is removed and imported into
        the database as a record holding only its ISBN, the tags in
        ``self.isbn_add_tags`` (if any) and, when the element has a path,
        that file as its only format. The new id is remembered in
        ``self.add_by_isbn_ids`` and the dialog's value is incremented before
        the method re-schedules itself with ``QTimer.singleShot``.

        When the list is empty the model is notified with the dialog's
        value, the dialog is accepted and metadata is downloaded for all
        remembered ids. On any error the dialog is accepted and the error
        is re-raised.
        """
        try:
            db = self.gui.library_view.model().db

            try:
                pending = self.isbn_books.pop(0)
            except IndexError:
                # All entries are done.
                view_model = self.gui.library_view.model()
                view_model.books_added(self.isbn_add_dialog.value)
                self.isbn_add_dialog.accept()
                downloader = self.gui.iactions['Edit Metadata']
                downloader.download_metadata(
                    ids=self.add_by_isbn_ids,
                    ensure_fields=frozenset(['title', 'authors']))
                return

            record = MetaInformation(None)
            record.isbn = pending['isbn']
            if self.isbn_add_tags:
                record.tags = list(self.isbn_add_tags)
            file_path = pending['path']
            if file_path is None:
                book_files = []
            else:
                book_files = [file_path]
            book_id = db.import_book(record, book_files)
            self.add_by_isbn_ids.add(book_id)
            self.isbn_add_dialog.value += 1
            # Continue with the next entry via a single-shot timer.
            QTimer.singleShot(10, self.do_one_isbn_add)
        except:
            # Catch-all on purpose: close the dialog, then re-raise unchanged.
            self.isbn_add_dialog.accept()
            raise
Example #10
0
def get_metadata_from_reader(rdr):
    """
    Build a MetaInformation object from an opened reader.

    The reader's home file is parsed with BeautifulSoup and searched for
    authors, publisher, ISBN, comments and cover. The title is taken from
    ``rdr.title`` and converted to unicode with the encoding the reader
    reports; cp1252 is used when that encoding cannot be obtained or is not
    a known codec.

    :param rdr: reader exposing ``GetFile``, ``home``, ``title`` and
                ``GetEncoding``
    :return: the populated MetaInformation object
    """
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
        resolve_entities=True)[0])

    title = rdr.title
    try:
        x = rdr.GetEncoding()
        # Validate the codec name; raises if it is unknown or not a string.
        codecs.lookup(x)
        enc = x
    except Exception:
        # Fall back to cp1252 on any ordinary failure, without swallowing
        # KeyboardInterrupt/SystemExit as a bare ``except`` would.
        enc = 'cp1252'
    title = force_unicode(title, enc)
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi
Example #11
0
def do_add_empty(db, title, authors, isbn, tags, series, series_index):
    """
    Add a book record without any format files to ``db``.

    ``title`` is applied whenever it is not None; ``authors``, ``isbn`` and
    ``tags`` are applied only when truthy. ``series_index`` is applied only
    together with a truthy ``series``. After the import, ``write_dirtied``
    and ``send_message`` are called.
    """
    from calibre.ebooks.metadata import MetaInformation
    book = MetaInformation(None)
    if title is not None:
        book.title = title
    # Optional fields, set in this order and only when a value was supplied.
    for attr, value in (('authors', authors), ('isbn', isbn), ('tags', tags)):
        if value:
            setattr(book, attr, value)
    if series:
        book.series = series
        book.series_index = series_index
    no_formats = []
    db.import_book(book, no_formats)
    write_dirtied(db)
    send_message()
Example #12
0
def do_add_empty(db, title, authors, isbn, tags, series, series_index):
    """
    Import a format-less book into ``db`` from the given metadata.

    The title is set unless it is None. Authors, ISBN and tags are set when
    they are truthy; the series and its index are set as a pair when
    ``series`` is truthy. The record is imported with an empty format list,
    then ``write_dirtied(db)`` and ``send_message()`` are invoked.
    """
    from calibre.ebooks.metadata import MetaInformation
    record = MetaInformation(None)
    if title is not None:
        record.title = title
    optional_fields = (
        ('authors', authors),
        ('isbn', isbn),
        ('tags', tags),
    )
    for field_name, field_value in optional_fields:
        if not field_value:
            continue
        setattr(record, field_name, field_value)
    if series:
        record.series = series
        record.series_index = series_index
    db.import_book(record, [])
    write_dirtied(db)
    send_message()
Example #13
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    For a ``.pmlz`` stream the archive is extracted into a temporary
    directory and the contents of all ``*.pml`` files found there are
    concatenated; any other stream is read directly. TITLE, AUTHOR,
    PUBLISHER, COPYRIGHT and ISBN values found in the PML comment blocks are
    decoded as cp1252, XML-escaped and stripped of control characters.
    Every AUTHOR value is appended to the author list.
    """
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    def tidy(raw_value):
        # bytes -> cp1252 text -> XML-escaped -> control characters removed
        text = raw_value.strip().decode('cp1252', 'replace')
        return re.sub('[\x00-\x1f]', '', prepare_string_for_xml(text))

    if stream.name.endswith('.pmlz'):
        with TemporaryDirectory('_unpmlz') as tdir:
            zf = ZipFile(stream)
            zf.extractall(tdir)

            chunks = []
            for pml_path in glob.glob(os.path.join(tdir, '*.pml')):
                with open(pml_path, 'r+b') as p_stream:
                    chunks.append(p_stream.read())
            pml = b''.join(chunks)
            if extract_cover:
                # Must run while the temporary directory still exists.
                base = os.path.splitext(os.path.basename(stream.name))[0]
                mi.cover_data = get_cover(base, tdir, True)
    else:
        pml = stream.read()
        if extract_cover:
            base = os.path.splitext(os.path.basename(stream.name))[0]
            folder = os.path.abspath(os.path.dirname(stream.name))
            mi.cover_data = get_cover(base, folder)

    # Searched in this order inside every comment block.
    field_patterns = (
        (br'TITLE="(.*?)"', 'title'),
        (br'AUTHOR="(.*?)"', 'authors'),
        (br'PUBLISHER="(.*?)"', 'publisher'),
        (br'COPYRIGHT="(.*?)"', 'rights'),
        (br'ISBN="(.*?)"', 'isbn'),
    )
    for comment in re.findall(br'(?ms)\\v.*?\\v', pml):
        for pattern, attr in field_patterns:
            m = re.search(pattern, comment)
            if not m:
                continue
            if attr == 'authors':
                # Replace the placeholder author on the first real one.
                if mi.authors == [_('Unknown')]:
                    mi.authors = []
                mi.authors.append(tidy(m.group(1)))
            else:
                setattr(mi, attr, tidy(m.group(1)))

    return mi
Example #14
0
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    A ``.pmlz`` stream is unzipped into a temporary directory and the text of
    every ``*.pml`` file in it is concatenated; other streams are read as is.
    TITLE, AUTHOR, PUBLISHER, COPYRIGHT and ISBN values found in the PML
    comment blocks are decoded as cp1252, XML-escaped and stripped of control
    characters. Every AUTHOR value is appended to the author list.
    """
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    def normalise(raw_value):
        # decode as cp1252, XML-escape, then drop control characters
        decoded = raw_value.strip().decode('cp1252', 'replace')
        return re.sub('[\x00-\x1f]', '', prepare_string_for_xml(decoded))

    pml = ''
    if stream.name.endswith('.pmlz'):
        with TemporaryDirectory('_unpmlz') as tdir:
            zf = ZipFile(stream)
            zf.extractall(tdir)

            for pml_file in glob.glob(os.path.join(tdir, '*.pml')):
                with open(pml_file, 'r+b') as p_stream:
                    pml += p_stream.read()
            if extract_cover:
                # The cover lookup needs the extracted files, so it happens
                # before the temporary directory is removed.
                stem = os.path.splitext(os.path.basename(stream.name))[0]
                mi.cover_data = get_cover(stem, tdir, True)
    else:
        pml = stream.read()
        if extract_cover:
            stem = os.path.splitext(os.path.basename(stream.name))[0]
            parent = os.path.abspath(os.path.dirname(stream.name))
            mi.cover_data = get_cover(stem, parent)

    # Searched in this order inside every comment block.
    lookups = (
        (r'TITLE="(.*?)"', 'title'),
        (r'AUTHOR="(.*?)"', 'authors'),
        (r'PUBLISHER="(.*?)"', 'publisher'),
        (r'COPYRIGHT="(.*?)"', 'rights'),
        (r'ISBN="(.*?)"', 'isbn'),
    )
    for comment in re.findall(r'(?mus)\\v.*?\\v', pml):
        for pattern, attr in lookups:
            m = re.search(pattern, comment)
            if not m:
                continue
            if attr == 'authors':
                # The first real author replaces the 'Unknown' placeholder.
                if mi.authors == [_('Unknown')]:
                    mi.authors = []
                mi.authors.append(normalise(m.group(1)))
            else:
                setattr(mi, attr, normalise(m.group(1)))

    return mi
Example #15
0
def do_add_empty(dbctx, title, authors, isbn, tags, series, series_index,
                 cover, identifiers, languages):
    """
    Add a book without formats through ``dbctx`` and print the new ids.

    ``title`` is used unless it is None; every other value is applied only
    when truthy. Identifiers are set before the ISBN, and ``series_index``
    is only applied together with ``series``. The metadata is passed through
    ``read_cover`` and sent with ``dbctx.run('add', 'empty', ...)``.
    """
    book = MetaInformation(None)
    if title is not None:
        book.title = title
    if authors:
        book.authors = authors
    if identifiers:
        book.set_identifiers(identifiers)
    # Remaining optional attributes, assigned in this order when provided.
    for attr, value in (('isbn', isbn), ('tags', tags)):
        if value:
            setattr(book, attr, value)
    if series:
        book.series = series
        book.series_index = series_index
    for attr, value in (('cover', cover), ('languages', languages)):
        if value:
            setattr(book, attr, value)
    ids, _duplicates = dbctx.run('add', 'empty', read_cover(book))
    id_list = ','.join(str(book_id) for book_id in ids)
    prints(_('Added book ids: %s') % id_list)
Example #16
0
def do_add_empty(
    dbctx, title, authors, isbn, tags, series, series_index, cover, identifiers,
    languages
):
    """
    Create a format-less book via ``dbctx`` and report the ids that were added.

    The title is set when it is not None. Authors, identifiers, ISBN, tags,
    series (with its index), cover and languages are each set only when a
    truthy value is supplied, in that order. The result of ``read_cover`` on
    the metadata is handed to ``dbctx.run('add', 'empty', ...)`` and the
    returned ids are printed as a comma-separated list.
    """
    meta = MetaInformation(None)
    if title is not None:
        meta.title = title
    if authors:
        meta.authors = authors
    if identifiers:
        meta.set_identifiers(identifiers)
    if isbn:
        meta.isbn = isbn
    if tags:
        meta.tags = tags
    if series:
        meta.series = series
        meta.series_index = series_index
    if cover:
        meta.cover = cover
    if languages:
        meta.languages = languages
    prepared = read_cover(meta)
    ids, _duplicates = dbctx.run('add', 'empty', prepared)
    joined_ids = ','.join([str(one_id) for one_id in ids])
    prints(_('Added book ids: %s') % joined_ids)
Example #17
0
 def populate(self, entries, browser, verbose=False, api_key=''):
     """
     Append one MetaInformation object per usable feed entry.

     For each entry the id URL and title are read first. The full record is
     then fetched from the id URL (with ``?apikey=<api_key>`` appended when a
     key is given) and, if that succeeds, used instead of the summary entry
     for comments, tags, ISBN, publisher and publication date.

     An entry whose id URL or title cannot be read is reported and skipped.

     :param entries: iterable of feed entry elements
     :param browser: object whose ``open(url).read()`` returns the raw feed
     :param verbose: when true, failures are printed
     :param api_key: optional API key appended to the id URL
     """
     for x in entries:
         try:
             id_url = entry_id(x)[0].text
             title = self.get_title(x)
         except Exception:
             report(verbose)
             # Without an id URL and title nothing sensible can be built.
             # Carrying on would fail on an undefined name for the first
             # entry, or silently reuse the previous entry's title and URL.
             continue
         mi = MetaInformation(title, self.get_authors(x))
         try:
             if api_key:
                 id_url = id_url + "?apikey=" + api_key
             raw = browser.open(id_url).read()
             feed = etree.fromstring(raw)
             x = entry(feed)[0]
         except Exception as e:
             # Best effort: fall back to the summary entry already in ``x``.
             if verbose:
                 print('Failed to get all details for an entry')
                 print(e)
         mi.comments = self.get_description(x, verbose)
         mi.tags = self.get_tags(x, verbose)
         mi.isbn = self.get_isbn(x, verbose)
         mi.publisher = self.get_publisher(x, verbose)
         mi.pubdate = self.get_date(x, verbose)
         self.append(mi)
Example #18
0
def _swap_author_name(a):
    """
    Move the last part of an author name to the front.

    The name is split once on the first comma if it has one, otherwise once
    on whitespace; a name that does not split is returned unchanged.
    """
    if ',' in a:
        parts = a.split(',', 1)
    else:
        parts = a.split(None, 1)
    if len(parts) > 1:
        t = parts[-1]
        parts = parts[:-1]
        parts.insert(0, t)
    # NOTE(review): after a comma split the parts keep their surrounding
    # whitespace, so "Doe, John" yields " John Doe" - confirm whether callers
    # normalise this.
    return ' '.join(parts)


def metadata_from_filename(name, pat=None, fallback_pat=None):
    """
    Extract metadata from a file name using a regular expression.

    The extension is removed and underscores are replaced by spaces before
    matching. The named groups ``title``, ``author``, ``series``,
    ``series_index``, ``isbn``, ``publisher``, ``published`` and ``comments``
    are all optional: a group that is missing from the pattern, or whose
    value cannot be converted, is skipped.

    :param name: file name, as text or as a byte string in the filesystem
                 encoding
    :param pat: compiled pattern; defaults to the ``filename_pattern``
                preference
    :param fallback_pat: compiled pattern tried when ``pat`` does not match
    :return: MetaInformation whose title falls back to the cleaned file name
    """
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # match.group() raises IndexError for a group the pattern lacks.
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    mi.authors = [_swap_author_name(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            # TypeError covers a group that exists but did not participate.
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            mi.isbn = match.group('isbn')
        except (IndexError, ValueError):
            pass
        try:
            mi.publisher = match.group('publisher')
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # An unparseable date is ignored, but KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass
        try:
            mi.comments = match.group('comments')
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Example #19
0
def _find_html_field(src, key, meta_names, backref_quotes=False):
    """
    Find one metadata field in ``src``.

    An HTML comment of the form ``<!-- ... KEY='value' ... -->`` is tried
    first. If there is none, each name in ``meta_names`` is tried in order
    with the pattern returned by ``get_meta_regexp_``; the first match wins.

    With ``backref_quotes`` the closing quote must be the same character as
    the opening one, so the value may contain the other quote character.

    Returns ``(value, from_comment)``; ``value`` is None if nothing matched.
    """
    if backref_quotes:
        template, group = r'<!--.*?%s=(?P<q>[\'"])(.+?)(?P=q).*?-->', 2
    else:
        template, group = r'<!--.*?%s=[\'"]([^"\']+)[\'"].*?-->', 1
    match = re.search(template % key, src, re.DOTALL)
    if match:
        return match.group(group), True
    for name in meta_names:
        match = get_meta_regexp_(name).search(src)
        if match:
            return match.group(1), False
    return None, False


def get_metadata_(src, encoding=None):
    """
    Extract metadata from HTML source and return a MetaInformation object.

    Every field is looked up first in an HTML comment
    (``<!-- TITLE='...' -->`` and so on) and then in ``<meta>`` tags. The
    title additionally falls back to the ``<title>`` element. Only the first
    150000 characters of the source are searched.

    :param src: HTML source, as unicode or as a byte string
    :param encoding: encoding used to decode a byte string; when not given
                     the text is obtained with ``xml_to_unicode``
    """
    if not isinstance(src, unicode):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')

    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    src = src[:150000]  # Searching shouldn't take too long

    # Title
    title = _find_html_field(
        src, 'TITLE', ('DC.title', 'DCTERMS.title', 'Title'),
        backref_quotes=True)[0]
    if not title:
        match = re.search('<title>([^<>]+?)</title>', src, re.IGNORECASE)
        if match:
            title = match.group(1)

    # Author
    author, from_comment = _find_html_field(
        src, 'AUTHOR',
        ('Author', 'DC.creator.aut', 'DCTERMS.creator.aut', 'DC.creator'),
        backref_quotes=True)
    if author and from_comment:
        # Only the comment form has its commas turned into semicolons.
        author = author.replace(',', ';')

    # Create MetaInformation with Title and Author
    ent_pat = re.compile(r'&(\S+)?;')
    if title:
        title = ent_pat.sub(entity_to_unicode, title)
    if author:
        author = ent_pat.sub(entity_to_unicode, author)
    mi = MetaInformation(title, [author] if author else None)

    # Publisher
    publisher = _find_html_field(
        src, 'PUBLISHER', ('Publisher', 'DC.publisher', 'DCTERMS.publisher'),
        backref_quotes=True)[0]
    if publisher:
        mi.publisher = ent_pat.sub(entity_to_unicode, publisher)

    # ISBN
    isbn = _find_html_field(
        src, 'ISBN',
        ('ISBN', 'DC.identifier.ISBN', 'DCTERMS.identifier.ISBN'))[0]
    if isbn:
        mi.isbn = re.sub(r'[^0-9xX]', '', isbn)

    # LANGUAGE
    language = _find_html_field(
        src, 'LANGUAGE', ('DC.language', 'DCTERMS.language'))[0]
    if language:
        mi.language = language

    # PUBDATE
    pubdate = _find_html_field(
        src, 'PUBDATE',
        ('Pubdate', 'Date of publication', 'DC.date.published',
         'DC.date.publication', 'DC.date.issued', 'DCTERMS.issued'))[0]
    if pubdate:
        try:
            mi.pubdate = parse_date(pubdate)
        except Exception:
            # An unparseable date is ignored; KeyboardInterrupt/SystemExit
            # are no longer swallowed as they were by a bare ``except``.
            pass

    # TIMESTAMP
    timestamp = _find_html_field(
        src, 'TIMESTAMP',
        ('Timestamp', 'Date of creation', 'DC.date.created',
         'DC.date.creation', 'DCTERMS.created'))[0]
    if timestamp:
        try:
            mi.timestamp = parse_date(timestamp)
        except Exception:
            pass

    # SERIES
    series = _find_html_field(src, 'SERIES', ('Series',))[0]
    if series:
        # The index may be embedded in the series name as "[n]".
        match = re.search(r'\[([.0-9]+)\]', series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except (TypeError, ValueError):
                pass
            series = series.replace(match.group(), '').strip()

        mi.series = ent_pat.sub(entity_to_unicode, series)
        if series_index is None:
            match = get_meta_regexp_("Seriesnumber").search(src)
            if match:
                try:
                    series_index = float(match.group(1))
                except (TypeError, ValueError):
                    pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = _find_html_field(src, 'RATING', ('Rating',))[0]
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            # Values above 5 are halved; anything still above 5 is
            # discarded.
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except Exception:
            pass

    # COMMENTS
    comments = _find_html_field(src, 'COMMENTS', ('Comments',))[0]
    if comments:
        mi.comments = ent_pat.sub(entity_to_unicode, comments)

    # TAGS
    tags = _find_html_field(src, 'TAGS', ('Tags',))[0]
    if tags:
        mi.tags = [
            x.strip() for x in ent_pat.sub(entity_to_unicode, tags).split(",")
        ]

    # Ready to return MetaInformation
    return mi
Example #20
0
    def isbn_api_add(self, isbn):
        '''
        Add an empty book for ``isbn``, fetch its metadata and return a
        human-readable status message.

        Everything except the digits 0-9 is removed from ``isbn``; the
        remaining digits must number exactly 10 or 13. The digits are kept as
        a string so that leading zeros survive.

        NOTE: ISBN-10 values ending in an 'X' check digit are still rejected,
        because the 'X' is stripped before the length check.

        :param isbn: ISBN text, possibly containing separators
        :return: message describing the outcome (success, import failure, no
                 metadata found, or metadata error)
        :raises cherrypy.HTTPError: 404 when ``isbn`` has no digits or the
                 wrong number of digits
        '''
        try:
            digits = re.sub(r'[^0-9]+', '', isbn)
        except TypeError:
            # Not a string at all: handled as "not a number" below.
            digits = ''
        if not digits:
            raise cherrypy.HTTPError(
                404, 'Invalid isbn %s is not a number: ' % (isbn,))
        # Converting to int here would drop leading zeros and reject valid
        # ISBN-10 values such as 0306406152.
        isbn = digits
        isbn_len = len(isbn)
        if isbn_len != 10 and isbn_len != 13:
            raise cherrypy.HTTPError(404, 'Invalid isbn {0} has a wrong length of {1}'.format(isbn, isbn_len))
        mi = MetaInformation(None)
        mi.isbn = isbn
        try:
            new_id = self.db.import_book(mi, [])
        except Exception:
            return 'could not add new book with isdb {0}'.format(isbn)

        # Look the metadata up and return with the id of the added book
        try:
            buf = BytesIO()
            log = create_log(buf)
            abort = Event()

            identifiers = {'isbn': mi.isbn}

            results = identify(log, abort, title=None, authors=[],
                    identifiers=identifiers, timeout=int(30000))

            if not results:
                # NOTE(review): ``buf`` is the stream given to create_log, so
                # this presumably dumps the captured log text - confirm.
                prints(buf.getvalue(), file=sys.stderr)
                prints('No results found', file=sys.stderr)
                return 'Could not find metadata for isbn {0}'.format(isbn)
            result = results[0]
            self.db.set_metadata(new_id, result)

            # Cover download is intentionally not performed here.

            result = unicode(result).encode('utf-8')

        except Exception as e:
            # The original ``except e:`` raised NameError instead of
            # reporting the failure.
            return 'Error getting metadata {0}'.format(e)

        return 'Added new book with isbn {0} with new id {1} and metadata {2}'.format(isbn, new_id, result)

    # }}}
Example #21
0
def get_metadata_(src, encoding=None):
    """Extract book metadata embedded in the text of an HTML document.

    Every field is looked up first in an HTML comment of the form
    ``<!-- ... KEY="value" ... -->`` and, when no such comment exists, with
    the patterns returned by ``get_meta_regexp_`` for a list of known names.
    The title additionally falls back to the ``<title>`` element.  Only the
    first 150000 characters of the document are searched.

    :param src: Document source.  Anything that is not already ``unicode``
        is decoded, with ``encoding`` when given and with ``xml_to_unicode``
        otherwise.
    :param encoding: Optional encoding name used to decode ``src``;
        undecodable bytes are replaced.
    :return: A ``MetaInformation`` object carrying every field that was
        found.  Values that cannot be converted (dates, series index,
        rating) are left unset.
    """
    if not isinstance(src, unicode):
        if encoding:
            src = src.decode(encoding, "replace")
        else:
            src = xml_to_unicode(src)[0]

    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    src = src[:150000]  # Searching shouldn't take too long

    def from_comment(key, matched_quotes=False):
        # Value of KEY="..." inside an HTML comment, or None when absent.
        if matched_quotes:
            # The closing quote must be the same character as the opening
            # one, so the value may contain the other kind of quote.
            pattern = r'<!--.*?' + key + r'=(?P<q>[\'"])(.+?)(?P=q).*?-->'
            group = 2
        else:
            pattern = r'<!--.*?' + key + r'=[\'"]([^"\']+)[\'"].*?-->'
            group = 1
        found = re.search(pattern, src, re.DOTALL)
        return found.group(group) if found else None

    def from_meta(*names):
        # First capture produced by the get_meta_regexp_ pattern of any of
        # ``names`` (tried in order), or None when none of them matches.
        for name in names:
            found = get_meta_regexp_(name).search(src)
            if found:
                return found.group(1)
        return None

    def lookup(key, *names):
        # Comment value when present, otherwise the first matching meta name.
        value = from_comment(key)
        return from_meta(*names) if value is None else value

    ent_pat = re.compile(r"&(\S+)?;")

    def unescape(text):
        return ent_pat.sub(entity_to_unicode, text)

    # Title
    title = from_comment("TITLE", matched_quotes=True)
    if title is None:
        title = from_meta("DC.title", "DCTERMS.title", "Title")
    if not title:
        found = re.search("<title>([^<>]+?)</title>", src, re.IGNORECASE)
        if found:
            title = found.group(1)

    # Author
    author = from_comment("AUTHOR", matched_quotes=True)
    if author is not None:
        # Only the comment form has its commas turned into semicolons.
        author = author.replace(",", ";")
    else:
        author = from_meta("Author", "DC.creator.aut", "DCTERMS.creator.aut", "DC.creator")

    # Create MetaInformation with Title and Author
    if title:
        title = unescape(title)
    if author:
        author = unescape(author)
    mi = MetaInformation(title, [author] if author else None)

    # Publisher
    publisher = from_comment("PUBLISHER", matched_quotes=True)
    if publisher is None:
        publisher = from_meta("Publisher", "DC.publisher", "DCTERMS.publisher")
    if publisher:
        mi.publisher = unescape(publisher)

    # ISBN
    isbn = lookup("ISBN", "ISBN", "DC.identifier.ISBN", "DCTERMS.identifier.ISBN")
    if isbn:
        # Keep only digits and the X check character.
        mi.isbn = re.sub(r"[^0-9xX]", "", isbn)

    # LANGUAGE
    language = lookup("LANGUAGE", "DC.language", "DCTERMS.language")
    if language:
        mi.language = language

    # PUBDATE
    pubdate = lookup(
        "PUBDATE",
        "Pubdate",
        "Date of publication",
        "DC.date.published",
        "DC.date.publication",
        "DC.date.issued",
        "DCTERMS.issued",
    )
    if pubdate:
        try:
            mi.pubdate = parse_date(pubdate)
        except Exception:
            pass  # an unparseable date is ignored on purpose

    # TIMESTAMP
    timestamp = lookup(
        "TIMESTAMP",
        "Timestamp",
        "Date of creation",
        "DC.date.created",
        "DC.date.creation",
        "DCTERMS.created",
    )
    if timestamp:
        try:
            mi.timestamp = parse_date(timestamp)
        except Exception:
            pass  # an unparseable date is ignored on purpose

    # SERIES
    series = lookup("SERIES", "Series")
    if series:
        series_index = None
        # A bracketed number inside the series text, e.g. "Name [3]", is the
        # index; it is removed from the series name.
        found = re.search(r"\[([.0-9]+)\]", series)
        if found is not None:
            try:
                series_index = float(found.group(1))
            except (TypeError, ValueError):
                pass
            series = series.replace(found.group(), "").strip()

        mi.series = unescape(series)
        if series_index is None:
            number = from_meta("Seriesnumber")
            if number is not None:
                try:
                    series_index = float(number)
                except (TypeError, ValueError):
                    pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = lookup("RATING", "Rating")
    if rating:
        try:
            value = float(rating)
            if value < 0:
                value = 0
            if value > 5:
                # Values above 5 are halved first ...
                value /= 2.0
            if value > 5:
                # ... and dropped to 0 when still out of range.
                value = 0
            mi.rating = value
        except Exception:
            pass  # a rating that is not a number is ignored

    # COMMENTS
    comments = lookup("COMMENTS", "Comments")
    if comments:
        mi.comments = unescape(comments)

    # TAGS
    tags = lookup("TAGS", "Tags")
    if tags:
        mi.tags = [x.strip() for x in unescape(tags).split(",")]

    # Ready to return MetaInformation
    return mi
Example #22
0
def get_metadata(stream, extract_cover=True):
    """Read book metadata from the ``meta.xml`` member of a zip container.

    The title, creator, description, language and keywords are taken from
    the elements registered in the module-level ``fields`` mapping.  The
    ``user-defined`` elements are collected into a dictionary; when its
    ``opf.metadata`` entry is truthy, the ``opf.*`` entries are applied on top
    of the standard values.  Unless ``opf.nocover`` is truthy the cover is
    read through ``read_cover``, and any error raised there is ignored.

    :param stream: Object handed to ``ZipFile`` and to ``read_cover``.
    :param extract_cover: Passed through unchanged to ``read_cover``.
    :return: The populated ``MetaInformation`` object.
    """
    ws_run = re.compile(r'\s+')

    def normalize(text):
        # Collapse every run of whitespace to one space and trim both ends.
        return ws_run.sub(' ', text).strip()

    with ZipFile(stream) as zf:
        root = fromstring(zf.read('meta.xml'))

        def find(field):
            # Normalized text of the first element registered for ``field``,
            # or None when the document contains no such element.
            ns, tag = fields[field]
            hits = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if not hits:
                return None
            text = tostring(hits[0],
                            method='text',
                            encoding='unicode',
                            with_tail=False)
            return normalize(text).strip()

        mi = MetaInformation(None, [])

        title = find('title')
        if title:
            mi.title = title

        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)

        desc = find('description')
        if desc:
            mi.comments = desc

        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]

        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [part.strip() for part in kw.split(',') if part.strip()]

        # Gather the user-defined name/value pairs; names are lower-cased and
        # 'boolean' values become real booleans.
        name_attr = '{%s}name' % METANS
        type_attr = '{%s}value-type' % METANS
        user_ns = {'ns0': fields['user-defined'][0]}
        data = {}
        for node in root.xpath('//ns0:user-defined', namespaces=user_ns):
            key = (node.get(name_attr) or '').lower()
            kind = node.get(type_attr) or 'string'
            value = node.text
            if not (key and value):
                continue
            data[key] = (value == 'true') if kind == 'boolean' else value

        opfmeta = bool(data.get('opf.metadata'))  # we need this later for the cover
        opfnocover = False
        if opfmeta:
            # custom metadata contains OPF information
            titlesort = data.get('opf.titlesort', '')
            if titlesort:
                mi.title_sort = titlesort

            opf_authors = data.get('opf.authors', '')
            if opf_authors:
                mi.authors = string_to_authors(opf_authors)

            authorsort = data.get('opf.authorsort', '')
            if authorsort:
                mi.author_sort = authorsort

            raw_isbn = data.get('opf.isbn', '')
            if raw_isbn:
                isbn = check_isbn(raw_isbn)
                if isbn is not None:
                    mi.isbn = isbn

            opf_publisher = data.get('opf.publisher', '')
            if opf_publisher:
                mi.publisher = opf_publisher

            opf_pubdate = data.get('opf.pubdate', '')
            if opf_pubdate:
                mi.pubdate = parse_date(opf_pubdate, assume_utc=True)

            raw_identifiers = data.get('opf.identifiers')
            if raw_identifiers:
                try:
                    mi.identifiers = json.loads(raw_identifiers)
                except Exception:
                    pass

            raw_rating = data.get('opf.rating')
            if raw_rating:
                try:
                    # Clamp the rating to the range 0..10.
                    mi.rating = max(0, min(float(raw_rating), 10))
                except Exception:
                    pass

            opf_series = data.get('opf.series', '')
            if opf_series:
                mi.series = opf_series
                raw_index = data.get('opf.seriesindex', '')
                if raw_index:
                    try:
                        mi.series_index = float(raw_index)
                    except Exception:
                        mi.series_index = 1.0

            opf_language = data.get('opf.language', '')
            if opf_language:
                cl = canonicalize_lang(opf_language)
                if cl:
                    mi.languages = [cl]

            opfnocover = data.get('opf.nocover', False)

        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
Example #23
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """Build a ``MetaInformation`` object from fields encoded in a file name.

    Everything from the last ``.`` onwards is dropped and underscores are
    replaced by spaces before matching.  ``pat`` is searched first; when it
    does not match and ``fallback_pat`` is given, ``fallback_pat`` is tried.
    The named groups ``title``, ``author``, ``series``, ``series_index``,
    ``isbn``, ``publisher``, ``published`` and ``comments`` are copied to the
    result when the matching pattern defines them; groups the pattern does
    not define are skipped.

    :param name: File name, as text or as a byte string (decoded with
        ``filesystem_encoding``, replacing undecodable bytes).
    :param pat: Compiled regular expression; defaults to the pattern stored
        under the ``filename_pattern`` preference.
    :param fallback_pat: Optional compiled regular expression used when
        ``pat`` finds nothing.
    :return: The ``MetaInformation`` object.  Its title is the stripped file
        name when no title could be extracted.
    """

    def swap(author):
        # Move the last part of the name to the front; the name is split at
        # the first comma when it has one, otherwise at the first whitespace.
        if ',' in author:
            parts = author.split(',', 1)
        else:
            parts = author.split(None, 1)
        if len(parts) > 1:
            parts = [parts[-1]] + parts[:-1]
        return ' '.join(parts)

    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    # NOTE(review): a name that contains no '.' becomes '' here, because
    # rpartition() returns the whole string in its last slot when the
    # separator is absent -- confirm callers always pass an extension.
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')

    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)

    if match is not None:
        # match.group() raises IndexError for a group name the pattern does
        # not define; that simply means the field is not available.
        try:
            mi.title = match.group('title')
        except IndexError:
            pass

        try:
            aus = string_to_authors(match.group('author'))
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass

        try:
            mi.series = match.group('series')
        except IndexError:
            pass

        try:
            mi.series_index = float(match.group('series_index'))
        except (IndexError, ValueError, TypeError):
            # TypeError: the group exists but did not take part in the match.
            pass

        for field in ('isbn', 'publisher', 'comments'):
            try:
                setattr(mi, field, match.group(field))
            except (IndexError, ValueError):
                pass

        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # Best effort: a missing group or an unparseable date leaves
            # pubdate unset, but KeyboardInterrupt/SystemExit still propagate.
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Example #24
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """Derive a ``MetaInformation`` object from the parts of a file name.

    The text after the last ``.`` is removed and underscores become spaces,
    then ``pat`` is searched in the result; ``fallback_pat`` is consulted
    only when ``pat`` finds nothing.  The named groups ``title``, ``author``,
    ``series``, ``series_index``, ``isbn``, ``publisher`` and ``published``
    are transferred to the result when the matching pattern defines them.

    :param name: File name as text, or as a byte string that is decoded with
        ``filesystem_encoding`` (undecodable bytes are replaced).
    :param pat: Compiled regular expression.  When omitted, the
        ``filename_pattern`` preference is compiled and used.
    :param fallback_pat: Optional compiled regular expression tried when
        ``pat`` does not match.
    :return: The ``MetaInformation`` object; the stripped file name is used
        as its title when no title was extracted.
    """

    def swap(author):
        # Put the trailing part of the name first.  The split point is the
        # first comma if there is one, else the first run of whitespace.
        if "," in author:
            parts = author.split(",", 1)
        else:
            parts = author.split(None, 1)
        if len(parts) > 1:
            parts = [parts[-1]] + parts[:-1]
        return " ".join(parts)

    if isbytestring(name):
        name = name.decode(filesystem_encoding, "replace")
    # NOTE(review): rpartition() yields '' as its first item when the name
    # has no '.', so an extension-less name is reduced to an empty string --
    # confirm that callers always supply an extension.
    name = name.rpartition(".")[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get("filename_pattern"))
    name = name.replace("_", " ")

    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)

    if match is not None:
        # An IndexError from match.group() means the pattern has no group of
        # that name, in which case the field is left untouched.
        try:
            mi.title = match.group("title")
        except IndexError:
            pass

        try:
            aus = string_to_authors(match.group("author"))
            if aus:
                mi.authors = aus
                if prefs["swap_author_names"] and mi.authors:
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass

        try:
            mi.series = match.group("series")
        except IndexError:
            pass

        try:
            mi.series_index = float(match.group("series_index"))
        except (IndexError, ValueError, TypeError):
            # TypeError: the group is defined but matched nothing (None).
            pass

        for field in ("isbn", "publisher"):
            try:
                setattr(mi, field, match.group(field))
            except (IndexError, ValueError):
                pass

        try:
            pubdate = match.group("published")
            if pubdate:
                from calibre.utils.date import parse_only_date

                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # Best effort: an absent group or a date that cannot be parsed
            # leaves pubdate unset; KeyboardInterrupt/SystemExit are no
            # longer swallowed.
            pass

    if mi.is_null("title"):
        mi.title = name
    return mi