Example #1
0
File: odt.py Project: sss/calibre
def get_metadata(stream, extract_cover=True):
    """Read metadata from an ODF container.

    ``meta.xml`` is read from the zip archive in *stream* and fed through a
    SAX parser; the fields collected by the handler are then copied onto a
    new ``MetaInformation`` object.

    :param stream: file-like object containing the zipped document.
    :param extract_cover: passed through unchanged to ``read_cover``.
    :return: the populated ``MetaInformation`` object.
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    # A non-blank initial-creator wins over the plain creator field.
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    # Unparseable index: fall back to the first position.
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except Exception:
            # Do not let an error reading the cover prevent reading other
            # data. KeyboardInterrupt/SystemExit are no longer swallowed.
            pass

    return mi
Example #2
0
def get_metadata(stream, extract_cover=True):
    """Build a ``MetaInformation`` object from an ODF file's ``meta.xml``.

    The zip archive in *stream* is opened, ``meta.xml`` is parsed with a
    namespace-aware SAX parser, and the handler's ``seenfields`` mapping is
    translated into metadata attributes. When the document carries custom
    OPF fields (``opf.metadata == 'true'``) those are applied as well.

    :param stream: file-like object holding the zipped document.
    :param extract_cover: forwarded as-is to ``read_cover``.
    :return: the populated ``MetaInformation`` object.
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    # Prefer a non-blank initial-creator; otherwise fall back to creator.
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    # A malformed index defaults to 1.0.
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except Exception:
            # Best effort: a failure while reading the cover must not
            # prevent returning the other metadata. Narrowed from a bare
            # except so KeyboardInterrupt/SystemExit still propagate.
            pass

    return mi
Example #3
0
def get_metadata_(path, cpath=None):
    """Read basic metadata from the PDF at *path* using podofo.

    :param path: path of the PDF file to open.
    :param cpath: optional destination path. When given, the PDF is copied
        there after checking that it has at least one page.
    :return: ``(title, authors, creator, tags, ok)``; ``ok`` is False when
        the page check or the copy to *cpath* failed.
    """
    p = podofo.PDFDoc()
    p.open(path)
    title = p.title or '_'
    author = p.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = p.creator
    try:
        tags = [x.strip() for x in p.keywords.split(u',')]
        tags = [x for x in tags if x]
    except Exception:
        # Keywords missing or unreadable: treat as no tags.
        tags = []
    ok = True
    if cpath is not None:
        try:
            if p.pages < 1:
                raise ValueError('PDF has no pages')
            # The original guarded this with `if True or pages == 1`, so the
            # first-page extraction branch could never run; the whole file
            # is always copied.
            shutil.copyfile(path, cpath)
        except Exception:
            import traceback
            traceback.print_exc()
            ok = False

    return (title, authors, creator, tags, ok)
Example #4
0
def get_metadata_(path, cpath=None):
    """Extract title, authors, creator and tags from a PDF via podofo.

    :param path: path of the PDF file to read.
    :param cpath: if not None, the PDF is copied to this path once it has
        been confirmed to contain at least one page.
    :return: tuple ``(title, authors, creator, tags, ok)``. ``ok`` becomes
        False if the page check or the copy raised; the traceback is printed.
    """
    p = podofo.PDFDoc()
    p.open(path)
    title = p.title
    if not title:
        title = '_'
    author = p.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = p.creator
    try:
        tags = [x.strip() for x in p.keywords.split(u',')]
        tags = [x for x in tags if x]
    except Exception:
        # No usable keywords value.
        tags = []
    ok = True
    try:
        if cpath is not None:
            pages = p.pages
            if pages < 1:
                raise ValueError('PDF has no pages')
            # `if True or pages == 1` in the original made the
            # extract_first_page()/save() branch dead code, so it has been
            # dropped; behaviour is unchanged.
            shutil.copyfile(path, cpath)
    except Exception:
        import traceback
        traceback.print_exc()
        ok = False

    return (title, authors, creator, tags, ok)
Example #5
0
def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object

    Reads the header of an IMP file from *stream*. If the first 10 bytes are
    not in ``MAGIC`` a message is printed to stderr and the default
    ("Unknown") metadata is returned. Any other error while reading is
    reported on stderr and whatever was gathered so far is returned.
    """
    title = 'Unknown'
    mi = MetaInformation(title, ['Unknown'])
    stream.seek(0)
    try:
        if stream.read(10) not in MAGIC:
            print >>sys.stderr, u'Couldn\'t read IMP header from file'
            return mi

        def cString(skip=0):
            # Read NUL-terminated strings byte by byte: discard the first
            # `skip` strings and return the one that follows.
            result = ''
            while 1:
                data = stream.read(1)
                if not data:
                    # End of stream before the terminator. Without this
                    # check a truncated file makes the loop spin forever.
                    raise ValueError('Unexpected end of file in IMP header')
                if data == '\x00':
                    if not skip:
                        return result
                    skip -= 1
                    result, data = '', ''
                result += data

        stream.read(38)  # skip past some uninteresting headers
        _, category, title, author = cString(), cString(), cString(1), cString(2)

        if title:
            mi.title = title
        if author:
            mi.authors = string_to_authors(author)
            mi.author = author
        if category:
            mi.category = category
    except Exception as err:
        msg = u'Couldn\'t read metadata from imp: %s with error %s'%(mi.title, unicode(err))
        print >>sys.stderr, msg.encode('utf8')
    return mi
Example #6
0
 def cell_changed(self, row, col):
     """Record an edit made to the table cell at (*row*, *col*).

     The author id is read from the ``Qt.UserRole`` data of the item in
     column 0 of the row. An edit to column 0 updates the stored author
     name and regenerates the sort value in column 1; an edit to any other
     column stores the cell text under ``self.get_column_name(col)``.
     The affected item is then made current and scrolled into view.
     """
     id_ = int(self.table.item(row, 0).data(Qt.UserRole))
     if col == 0:
         item = self.table.item(row, 0)
         aut = unicode_type(item.text()).strip()
         aut_list = string_to_authors(aut)
         if len(aut_list) != 1:
             # The text parsed into something other than exactly one author:
             # warn the user and rewrite the cell with the parts joined by
             # ' % '.
             error_dialog(
                 self.parent(), _('Invalid author name'),
                 _('You cannot change an author to multiple authors.')
             ).exec_()
             aut = ' % '.join(aut_list)
             self.table.item(row, 0).setText(aut)
         item.set_sort_key()
         self.authors[id_]['name'] = aut
         self.set_icon(item, id_)
         # Regenerate the sort string from the (possibly rewritten) name.
         c = self.table.item(row, 1)
         txt = author_to_author_sort(aut)
         self.authors[id_]['sort'] = txt
         c.setText(txt)  # This triggers another cellChanged event
         item = c
     else:
         item = self.table.item(row, col)
         item.set_sort_key()
         self.set_icon(item, id_)
         self.authors[id_][self.get_column_name(col)] = unicode_type(
             item.text())
     self.table.setCurrentItem(item)
     self.table.scrollToItem(item)
Example #7
0
def _read_doc_props(raw, mi):
    """Copy document properties from the XML in *raw* onto *mi*.

    Sets ``mi.title`` from the first ``dc:title``, ``mi.tags`` from
    ``dc:subject`` and ``cp:keywords``, ``mi.authors`` from ``dc:creator``
    and ``mi.comments`` from the first ``dc:description``. Attributes are
    only assigned when a non-blank value was found.

    :param raw: serialized XML of the properties part.
    :param mi: metadata object to update in place.
    """
    from calibre.ebooks.metadata import string_to_authors
    root = etree.fromstring(raw, parser=RECOVER_PARSER)
    titles = XPath('//dc:title')(root)
    if titles:
        title = titles[0].text
        if title and title.strip():
            mi.title = title.strip()
    tags = []
    for subject in XPath('//dc:subject')(root):
        if subject.text and subject.text.strip():
            # A subject is a single tag, so embedded commas are replaced.
            tags.append(subject.text.strip().replace(',', '_'))
    for keywords in XPath('//cp:keywords')(root):
        if keywords.text and keywords.text.strip():
            for x in keywords.text.split():
                # Keywords are split on whitespace and on commas. Skip the
                # empty pieces produced by input such as "a, b" so that no
                # blank tags are recorded.
                tags.extend(y.strip() for y in x.split(',') if y.strip())
    if tags:
        mi.tags = tags
    authors = XPath('//dc:creator')(root)
    aut = []
    for author in authors:
        if author.text and author.text.strip():
            aut.extend(string_to_authors(author.text))
    if aut:
        mi.authors = aut

    desc = XPath('//dc:description')(root)
    if desc:
        raw = etree.tostring(desc[0], method='text', encoding=unicode)
        mi.comments = raw
Example #8
0
 def get_title_and_authors(self):
     """Return ``(title, authors)`` from the ``title`` and ``author`` fields.

     Both texts are stripped; a blank title becomes the translated
     'Unknown' and a blank author field becomes a one-element list holding
     the translated 'Unknown'.
     """
     title = unicode(self.title.text()).strip() or _('Unknown')
     author_text = unicode(self.author.text()).strip()
     if author_text:
         authors = string_to_authors(author_text)
     else:
         authors = [_('Unknown')]
     return title, authors
Example #9
0
def _get_authors(soup):
    """Return the list of authors found in *soup*.

    The author text is looked up in a span first and then in a table; when
    neither lookup yields a value (the result is None) the translated
    'Unknown' is returned as the only author.
    """
    author_text = _metadata_from_span(soup, r'author')
    if not author_text:
        author_text = _metadata_from_table(soup, r'^\s*by\s*:?\s+')
    if author_text is None:
        return [_('Unknown')]
    return string_to_authors(author_text)
Example #10
0
 def get_title_and_authors(self):
     """Return the stripped title text and the parsed list of authors.

     An empty title is replaced by the translated 'Unknown'; an empty
     author field yields ``[_('Unknown')]``.
     """
     title = str(self.title.text()).strip() or _('Unknown')
     author_text = str(self.author.text()).strip()
     if not author_text:
         return title, [_('Unknown')]
     return title, string_to_authors(author_text)
Example #11
0
def get_djvu_metadata(stream, cover=True):
    """Read title, authors and optionally the cover of a DjVu file.

    *stream* is copied to ``src.djvu`` in a temporary directory and a worker
    job is forked to process it. Anything the worker wrote to its
    stdout/stderr file is printed.

    :param stream: seekable file-like object with the DjVu data.
    :param cover: when true, ``cover.jpg`` from the temporary directory (if
        the worker produced one) is attached as cover data.
    :raises RuntimeError: if the worker job fails.
    :raises ValueError: if the worker returned no metadata.
    :return: a ``MetaInformation`` object.
    """
    with TemporaryDirectory('_djvu_metadata_read') as workdir:
        stream.seek(0)
        source_path = os.path.join(workdir, 'src.djvu')
        with open(source_path, 'wb') as dest:
            shutil.copyfileobj(stream, dest)
        try:
            job = fork_job('calibre_plugins.djvu_metadata.djvu', 'get_djvu_metadata_worker', (workdir, bool(cover)))
        except WorkerError as err:
            prints(err.orig_tb)
            raise RuntimeError('Failed to run djvused')
        info = job['result']
        with open(job['stdout_stderr'], 'rb') as worker_log:
            worker_output = worker_log.read().strip()
            if worker_output:
                prints(worker_output)
        if info is None:
            raise ValueError('Could not read metadata from djvu')
        # The cover must be read before the temporary directory goes away.
        cover_path = os.path.join(workdir, 'cover.jpg')
        cover_bytes = None
        if cover and os.path.exists(cover_path):
            with open(cover_path, 'rb') as cover_file:
                cover_bytes = cover_file.read()

    title = info.get('Title', None)
    author_field = info.get('Author', None)
    if author_field is not None:
        authors = string_to_authors(author_field)
    else:
        authors = [_('Unknown')]
    mi = MetaInformation(title, authors)

    if cover_bytes:
        mi.cover_data = ('jpg', cover_bytes)

    return mi
Example #12
0
 def cell_changed(self, row, col):
     """Store an edit made to the cell at (*row*, *col*) in ``self.authors``.

     The edited item is given the edited icon. Column 0 holds the author
     name: it must parse to exactly one author, otherwise an error dialog
     is shown and the cell text is rewritten with the parts joined by
     ' % '; the sort value in column 1 is then regenerated from the name.
     Column 1 is stored as 'sort' and any other column as 'link'. The
     affected item is finally made current and scrolled into view.
     """
     author_id = int(self.table.item(row, 0).data(Qt.UserRole))
     if col != 0:
         changed = self.table.item(row, col)
         changed.setIcon(self.edited_icon)
         field = 'sort' if col == 1 else 'link'
         self.authors[author_id][field] = unicode_type(changed.text())
     else:
         name_item = self.table.item(row, 0)
         name_item.setIcon(self.edited_icon)
         name = unicode_type(name_item.text()).strip()
         parsed = string_to_authors(name)
         if len(parsed) != 1:
             error_dialog(
                 self.parent(), _('Invalid author name'),
                 _('You cannot change an author to multiple authors.')
             ).exec_()
             name = ' % '.join(parsed)
             self.table.item(row, 0).setText(name)
         self.authors[author_id]['name'] = name
         # Keep the same call order as before: the sort cell text is set
         # first, then the stored sort value is written.
         sort_item = self.table.item(row, 1)
         sort_text = author_to_author_sort(name)
         sort_item.setText(sort_text)
         self.authors[author_id]['sort'] = sort_text
         changed = sort_item
     self.table.setCurrentItem(changed)
     self.table.scrollToItem(changed)
Example #13
0
def _read_doc_props(raw, mi):
    """Update *mi* with the properties found in the XML document *raw*.

    Title, tags (from ``dc:subject`` and ``cp:keywords``), authors (from
    ``dc:creator``) and comments (from ``dc:description``) are assigned
    only when a non-blank value is present.

    :param raw: serialized XML of the properties part.
    :param mi: metadata object that is modified in place.
    """
    from calibre.ebooks.metadata import string_to_authors
    root = etree.fromstring(raw, parser=RECOVER_PARSER)
    titles = XPath('//dc:title')(root)
    if titles:
        title = titles[0].text
        if title and title.strip():
            mi.title = title.strip()
    tags = []
    for subject in XPath('//dc:subject')(root):
        if subject.text and subject.text.strip():
            # Each subject is one tag; commas inside it are replaced.
            tags.append(subject.text.strip().replace(',', '_'))
    for keywords in XPath('//cp:keywords')(root):
        if keywords.text and keywords.text.strip():
            for word in keywords.text.split():
                # `word` has no whitespace, so the comma-separated pieces
                # need no stripping; drop the empty ones ("a," -> 'a', '')
                # so that blank tags are never added.
                tags.extend(piece for piece in word.split(',') if piece)
    if tags:
        mi.tags = tags
    authors = XPath('//dc:creator')(root)
    aut = []
    for author in authors:
        if author.text and author.text.strip():
            aut.extend(string_to_authors(author.text))
    if aut:
        mi.authors = aut

    desc = XPath('//dc:description')(root)
    if desc:
        raw = etree.tostring(desc[0], method='text', encoding=unicode)
        mi.comments = raw
Example #14
0
def _get_authors(soup):
    """Extract the author list from *soup*.

    Tries the span lookup, then the table lookup. If the outcome is None
    the result is ``[_('Unknown')]``; otherwise the text is parsed with
    ``string_to_authors``.
    """
    found = _metadata_from_span(soup, r'author')
    if not found:
        found = _metadata_from_table(soup, r'^\s*by\s*:?\s+')
    if found is not None:
        return string_to_authors(found)
    return [_('Unknown')]
Example #15
0
def main(args=sys.argv):
    """Identify a book from the command line and print the first result.

    Options supply the title, authors and ISBN to search for. The first
    result is printed either as OPF or as text; with the cover option the
    cover is downloaded and saved to the given path.

    :param args: argument list, defaults to ``sys.argv``.
    :raises SystemExit: with status 1 when nothing was found.
    :return: 0 on success.
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)

    buf = BytesIO()
    log = create_log(buf)
    abort = Event()
    patch_plugins()

    authors = []
    if opts.authors:
        authors = string_to_authors(opts.authors)

    identifiers = {}
    if opts.isbn:
        identifiers['isbn'] = opts.isbn

    allowed_plugins = frozenset(opts.allowed_plugin)
    results = identify(log,
                       abort,
                       title=opts.title,
                       authors=authors,
                       identifiers=identifiers,
                       timeout=int(opts.timeout),
                       allowed_plugins=allowed_plugins or None)

    if not results:
        # Show what was logged (the buffer contents), not the log object.
        prints(buf.getvalue(), file=sys.stderr)
        prints('No results found', file=sys.stderr)
        raise SystemExit(1)
    result = results[0]

    cf = None
    if opts.cover and results:
        cover = download_cover(log,
                               title=opts.title,
                               authors=authors,
                               identifiers=result.identifiers,
                               timeout=int(opts.timeout))
        if cover is None:
            # Previously a missing cover combined with the opf option fell
            # through to cover[-1] and raised TypeError.
            if not opts.opf:
                prints('No cover found', file=sys.stderr)
        else:
            save_cover_data_to(cover[-1], opts.cover)
            result.cover = cf = opts.cover

    log = buf.getvalue()

    result = (metadata_to_opf(result)
              if opts.opf else unicode(result).encode('utf-8'))

    if opts.verbose:
        print(log, file=sys.stderr)

    print(result)
    if not opts.opf and opts.cover:
        prints('Cover               :', cf)

    return 0
Example #16
0
def main(args=sys.argv):
    """Command-line entry point: identify a book and print the first result.

    Search terms come from the title, authors, ISBN and ``type:value``
    identifier options. The first result is written as OPF or as text, and
    the cover is downloaded to the path given by the cover option.

    :param args: argument list, defaults to ``sys.argv``.
    :raises SystemExit: for a malformed identifier, or with status 1 when
        no results were found.
    :return: 0 on success.
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)

    buf = BytesIO()
    log = create_log(buf)
    abort = Event()
    patch_plugins()

    authors = string_to_authors(opts.authors) if opts.authors else []

    identifiers = {}
    for idspec in opts.identifier:
        key, sep, value = idspec.partition(':')
        if not (key and value):
            raise SystemExit('Not a valid identifier: {}'.format(idspec))
        identifiers[key] = value
    # An explicit ISBN option overrides any isbn identifier given above.
    if opts.isbn:
        identifiers['isbn'] = opts.isbn

    allowed_plugins = frozenset(opts.allowed_plugin)
    results = identify(
        log, abort,
        title=opts.title,
        authors=authors,
        identifiers=identifiers,
        timeout=int(opts.timeout),
        allowed_plugins=allowed_plugins or None,
    )

    if not results:
        prints(buf.getvalue(), file=sys.stderr)
        prints('No results found', file=sys.stderr)
        raise SystemExit(1)
    result = results[0]

    cf = None
    if opts.cover and results:
        cover = download_cover(
            log,
            title=opts.title,
            authors=authors,
            identifiers=result.identifiers,
            timeout=int(opts.timeout),
        )
        if cover is not None:
            save_cover_data_to(cover[-1], opts.cover)
            result.cover = cf = opts.cover
        elif not opts.opf:
            prints('No cover found', file=sys.stderr)

    if opts.verbose:
        prints(buf.getvalue(), file=sys.stderr)

    if opts.opf:
        # Write the OPF to the underlying binary stream when there is one.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        out.write(metadata_to_opf(result))
        print()
    else:
        prints(str(result))
        if opts.cover:
            prints('Cover               :', cf)

    return 0
Example #17
0
def main(args=sys.argv):
    """Identify a book from command-line options and print the first result.

    Search terms come from the title, authors, ISBN and ``type:value``
    identifier options. The first result is printed as OPF or as text; with
    the cover option the cover is downloaded and saved to the given path.

    :param args: argument list, defaults to ``sys.argv``.
    :raises SystemExit: for a malformed identifier, or with status 1 when
        no results were found.
    :return: 0 on success.
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)

    buf = BytesIO()
    log = create_log(buf)
    abort = Event()
    patch_plugins()

    authors = []
    if opts.authors:
        authors = string_to_authors(opts.authors)

    identifiers = {}
    for idspec in opts.identifier:
        k, v = idspec.partition(':')[::2]
        if not k or not v:
            raise SystemExit('Not a valid identifier: {}'.format(idspec))
        identifiers[k] = v
    if opts.isbn:
        identifiers['isbn'] = opts.isbn

    allowed_plugins = frozenset(opts.allowed_plugin)
    results = identify(log, abort, title=opts.title, authors=authors,
            identifiers=identifiers, timeout=int(opts.timeout),
            allowed_plugins=allowed_plugins or None)

    if not results:
        # Print the buffered log text rather than the log object itself.
        prints(buf.getvalue(), file=sys.stderr)
        prints('No results found', file=sys.stderr)
        raise SystemExit(1)
    result = results[0]

    cf = None
    if opts.cover and results:
        cover = download_cover(log, title=opts.title, authors=authors,
                identifiers=result.identifiers, timeout=int(opts.timeout))
        if cover is None:
            # With the opf option a missing cover used to reach cover[-1]
            # and raise TypeError; now it is simply skipped.
            if not opts.opf:
                prints('No cover found', file=sys.stderr)
        else:
            save_cover_data_to(cover[-1], opts.cover)
            result.cover = cf = opts.cover

    log = buf.getvalue()

    result = (metadata_to_opf(result) if opts.opf else
                    unicode_type(result).encode('utf-8'))

    if opts.verbose:
        print(log, file=sys.stderr)

    print(result)
    if not opts.opf and opts.cover:
        prints('Cover               :', cf)

    return 0
Example #18
0
File: pdf.py Project: kmshi/calibre
def get_metadata(stream, cover=True):
    """Read metadata (and optionally the cover) from a PDF in *stream*.

    The stream is copied to ``src.pdf`` in a temporary directory and a
    worker job is forked to read the info dictionary. Keywords become tags;
    the first keyword that is a valid ISBN is stored as ``mi.isbn`` and
    removed from the tags. A subject, if present, is put first in the tags.

    :param stream: seekable file-like object with the PDF data.
    :param cover: when true, ``cover.jpg`` from the temporary directory (if
        present) is attached as cover data.
    :raises RuntimeError: if the worker job fails.
    :raises ValueError: if the worker returned an empty info dict.
    :return: a ``MetaInformation`` object.
    """
    with TemporaryDirectory('_pdf_metadata_read') as workdir:
        stream.seek(0)
        with open(os.path.join(workdir, 'src.pdf'), 'wb') as dest:
            shutil.copyfileobj(stream, dest)
        try:
            job = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
                           (workdir, bool(cover)))
        except WorkerError as err:
            prints(err.orig_tb)
            raise RuntimeError('Failed to run pdfinfo')
        info = job['result']
        with open(job['stdout_stderr'], 'rb') as worker_log:
            worker_output = worker_log.read().strip()
            if worker_output:
                prints(worker_output)
        if not info:
            raise ValueError('Could not read info dict from PDF')
        # Read the cover while the temporary directory still exists.
        cover_path = os.path.join(workdir, 'cover.jpg')
        cover_bytes = None
        if cover and os.path.exists(cover_path):
            with open(cover_path, 'rb') as cover_file:
                cover_bytes = cover_file.read()

    title = info.get('Title', None)
    author_field = info.get('Author', None)
    if author_field is not None:
        authors = string_to_authors(author_field)
    else:
        authors = [_('Unknown')]
    mi = MetaInformation(title, authors)

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        mi.tags = [kw.strip() for kw in keywords.split(',')]
        valid_isbns = [check_isbn(tag) for tag in mi.tags if check_isbn(tag)]
        # With no valid ISBN this stays an empty list, which no check_isbn
        # result equals, so every tag is kept by the filter below.
        isbn = valid_isbns
        if valid_isbns:
            isbn = valid_isbns[0]
            mi.isbn = isbn
        mi.tags = [tag for tag in mi.tags if check_isbn(tag) != isbn]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if cover_bytes:
        mi.cover_data = ('jpeg', cover_bytes)

    return mi
Example #19
0
def get_metadata(stream, cover=True):
    """Return a ``MetaInformation`` object for the PDF in *stream*.

    A worker job reads the info dictionary from a temporary copy of the
    file. Title, author, creator, keywords and subject are mapped onto the
    metadata object; the first keyword that validates as an ISBN is moved
    from the tags to ``mi.isbn``.

    :param stream: seekable file-like object with the PDF data.
    :param cover: when true, ``cover.jpg`` from the temporary directory (if
        present) is attached as cover data.
    :raises RuntimeError: if the worker job fails.
    :raises ValueError: if no info dict could be read.
    """
    with TemporaryDirectory("_pdf_metadata_read") as tmpdir:
        stream.seek(0)
        pdf_copy = os.path.join(tmpdir, "src.pdf")
        with open(pdf_copy, "wb") as out:
            shutil.copyfileobj(stream, out)
        try:
            outcome = fork_job("calibre.ebooks.metadata.pdf", "read_info", (tmpdir, bool(cover)))
        except WorkerError as exc:
            prints(exc.orig_tb)
            raise RuntimeError("Failed to run pdfinfo")
        info = outcome["result"]
        with open(outcome["stdout_stderr"], "rb") as captured:
            text = captured.read().strip()
            if text:
                prints(text)
        if not info:
            raise ValueError("Could not read info dict from PDF")
        image_path = os.path.join(tmpdir, "cover.jpg")
        image = None
        if cover and os.path.exists(image_path):
            with open(image_path, "rb") as image_file:
                image = image_file.read()

    title = info.get("Title", None)
    raw_author = info.get("Author", None)
    authors = [_("Unknown")] if raw_author is None else string_to_authors(raw_author)
    mi = MetaInformation(title, authors)

    producer = info.get("Creator", None)
    if producer:
        mi.book_producer = producer

    keywords = info.get("Keywords", None)
    mi.tags = []
    if keywords:
        mi.tags = [part.strip() for part in keywords.split(",")]
        # Stays an empty list when no tag is a valid ISBN; nothing returned
        # by check_isbn equals it, so the filter below keeps every tag.
        isbn = [check_isbn(tag) for tag in mi.tags if check_isbn(tag)]
        if isbn:
            isbn = isbn[0]
            mi.isbn = isbn
        mi.tags = [tag for tag in mi.tags if check_isbn(tag) != isbn]

    subject = info.get("Subject", None)
    if subject:
        mi.tags.insert(0, subject)

    if image:
        mi.cover_data = ("jpeg", image)

    return mi
Example #20
0
def do_set_metadata(opts, mi, stream, stream_type):
    """Apply the options in *opts* on top of *mi* and write it to *stream*.

    A copy of *mi* is made, optionally updated from an OPF file
    (``opts.from_opf``), then overridden field by field from the remaining
    options before ``set_metadata`` is called.

    :param opts: parsed options object; attributes that are missing or None
        leave the corresponding field untouched.
    :param mi: metadata to start from.
    :param stream: stream of the book file to update.
    :param stream_type: format identifier passed to ``set_metadata``.
    """
    mi = MetaInformation(mi)
    for x in ('guide', 'toc', 'manifest', 'spine'):
        setattr(mi, x, None)

    from_opf = getattr(opts, 'from_opf', None)
    if from_opf is not None:
        from calibre.ebooks.metadata.opf2 import OPF
        # Close the OPF file once it has been parsed instead of leaking it.
        with open(from_opf, 'rb') as opf_stream:
            opf_mi = OPF(opf_stream).to_book_metadata()
        mi.smart_update(opf_mi)

    for pref in config().option_set.preferences:
        # These options need special handling below (or are not metadata).
        if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort',
                         'author_sort', 'get_cover', 'cover', 'tags',
                         'lrf_bookid', 'identifiers'):
            continue
        val = getattr(opts, pref.name, None)
        if val is not None:
            setattr(mi, pref.name, val)
    if getattr(opts, 'authors', None) is not None:
        mi.authors = string_to_authors(opts.authors)
        mi.author_sort = authors_to_sort_string(mi.authors)
    # An explicit author sort overrides the one derived from the authors.
    if getattr(opts, 'author_sort', None) is not None:
        mi.author_sort = opts.author_sort
    if getattr(opts, 'title_sort', None) is not None:
        mi.title_sort = opts.title_sort
    elif getattr(opts, 'title', None) is not None:
        mi.title_sort = title_sort(opts.title)
    if getattr(opts, 'tags', None) is not None:
        mi.tags = [t.strip() for t in opts.tags.split(',')]
    if getattr(opts, 'series', None) is not None:
        mi.series = opts.series.strip()
    if getattr(opts, 'series_index', None) is not None:
        mi.series_index = float(opts.series_index.strip())
    if getattr(opts, 'pubdate', None) is not None:
        mi.pubdate = parse_date(opts.pubdate, assume_utc=False, as_utc=False)
    if getattr(opts, 'identifiers', None):
        val = {
            k.strip(): v.strip()
            for k, v in (x.partition(':')[0::2] for x in opts.identifiers)
        }
        if val:
            # Merge with the existing identifiers; entries that end up with
            # an empty key or value are dropped.
            orig = mi.get_identifiers()
            orig.update(val)
            val = {k: v for k, v in orig.iteritems() if k and v}
            mi.set_identifiers(val)

    if getattr(opts, 'cover', None) is not None:
        ext = os.path.splitext(opts.cover)[1].replace('.', '').upper()
        # Read the cover through a context manager so the file is closed.
        with open(opts.cover, 'rb') as cover_file:
            mi.cover_data = (ext, cover_file.read())

    with force_identifiers:
        set_metadata(stream, mi, stream_type)
Example #21
0
File: pdf.py Project: Eksmo/calibre
def get_metadata(stream, cover=True):
    """Read metadata and, optionally, the cover from the PDF in *stream*.

    The data is copied to ``src.pdf`` in a temporary directory where a
    forked worker reads the info dictionary. Title, author, creator,
    keywords (as tags) and subject (as first tag) are then applied.

    :param stream: seekable file-like object with the PDF data.
    :param cover: when true, ``cover.jpg`` from the temporary directory (if
        present) is attached as cover data.
    :raises RuntimeError: if the worker job fails.
    :raises ValueError: if no info dict could be read.
    :return: a ``MetaInformation`` object.
    """
    with TemporaryDirectory('_pdf_metadata_read') as scratch:
        stream.seek(0)
        with open(os.path.join(scratch, 'src.pdf'), 'wb') as target:
            shutil.copyfileobj(stream, target)
        try:
            reply = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
                    (scratch, bool(cover)))
        except WorkerError as failure:
            prints(failure.orig_tb)
            raise RuntimeError('Failed to run pdfinfo')
        info = reply['result']
        with open(reply['stdout_stderr'], 'rb') as output_file:
            output = output_file.read().strip()
            if output:
                prints(output)
        if not info:
            raise ValueError('Could not read info dict from PDF')
        cover_file_path = os.path.join(scratch, 'cover.jpg')
        cover_payload = None
        if cover and os.path.exists(cover_file_path):
            with open(cover_file_path, 'rb') as handle:
                cover_payload = handle.read()

    author_value = info.get('Author', None)
    if author_value is None:
        author_list = [_('Unknown')]
    else:
        author_list = string_to_authors(author_value)
    mi = MetaInformation(info.get('Title', None), author_list)

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        mi.tags = [word.strip() for word in keywords.split(',')]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if cover_payload:
        mi.cover_data = ('jpeg', cover_payload)

    return mi
Example #22
0
def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False):
    """Write the metadata in *mi* into the OPF tree *root*.

    Identifiers, title, languages, authors, publication date and timestamp
    are set, then the tree is pretty printed. Each author is paired with
    the entry at the same position in the parsed ``mi.author_sort`` string,
    or with None when there are fewer sort entries than authors.

    ``cover_prefix``, ``cover_data``, ``apply_null`` and
    ``update_timestamp`` are accepted but not used by this function.
    """
    prefixes = read_prefixes(root)
    refines = read_refines(root)
    set_identifiers(root, prefixes, refines, mi.identifiers, force_identifiers=force_identifiers)
    set_title(root, prefixes, refines, mi.title, mi.title_sort)
    set_languages(root, prefixes, refines, mi.languages)

    sort_names = string_to_authors(mi.author_sort or '')
    sort_count = len(sort_names)
    authors = [
        Author(name, sort_names[idx] if idx < sort_count else None)
        for idx, name in enumerate(mi.authors)
    ]
    set_authors(root, prefixes, refines, authors)

    set_pubdate(root, prefixes, refines, mi.pubdate)
    set_timestamp(root, prefixes, refines, mi.timestamp)

    pretty_print_opf(root)
Example #23
0
def main(args=sys.argv):
    """Identify a book from command-line options and print the first result.

    The title, authors and ISBN options are used as search terms. The first
    result is printed as OPF or as text; with the cover option the cover is
    downloaded and saved to the given path.

    :param args: argument list, defaults to ``sys.argv``.
    :raises SystemExit: with status 1 when nothing was found.
    :return: 0 on success.
    """
    parser = option_parser()
    opts, args = parser.parse_args(args)

    buf = BytesIO()
    log = create_log(buf)
    abort = Event()

    authors = []
    if opts.authors:
        authors = string_to_authors(opts.authors)

    identifiers = {}
    if opts.isbn:
        identifiers['isbn'] = opts.isbn

    results = identify(log, abort, title=opts.title, authors=authors,
            identifiers=identifiers, timeout=int(opts.timeout))

    if not results:
        # Print the text collected in the buffer, not the log object.
        prints(buf.getvalue(), file=sys.stderr)
        prints('No results found', file=sys.stderr)
        raise SystemExit(1)
    result = results[0]

    cf = None
    if opts.cover and results:
        cover = download_cover(log, title=opts.title, authors=authors,
                identifiers=result.identifiers, timeout=int(opts.timeout))
        if cover is None:
            prints('No cover found', file=sys.stderr)
        else:
            save_cover_data_to(cover[-1], opts.cover)
            result.cover = cf = opts.cover

    log = buf.getvalue()

    result = (metadata_to_opf(result) if opts.opf else
                    unicode(result).encode('utf-8'))

    if opts.verbose:
        print (log, file=sys.stderr)

    print (result)
    if not opts.opf and opts.cover:
        prints('Cover               :', cf)

    return 0
Example #24
0
def do_set_metadata(opts, mi, stream, stream_type):
    """Override *mi* with the values in *opts* and write it to *stream*.

    Works on a copy of *mi*: it is first updated from ``opts.from_opf`` (if
    given), then from the individual options, and finally passed to
    ``set_metadata``.

    :param opts: parsed options object; attributes that are missing or None
        leave the corresponding field untouched.
    :param mi: metadata to start from.
    :param stream: stream of the book file to update.
    :param stream_type: format identifier passed to ``set_metadata``.
    """
    mi = MetaInformation(mi)
    for x in ('guide', 'toc', 'manifest', 'spine'):
        setattr(mi, x, None)

    from_opf = getattr(opts, 'from_opf', None)
    if from_opf is not None:
        from calibre.ebooks.metadata.opf2 import OPF
        # The file handle used to be left open; parse inside a with block.
        with open(from_opf, 'rb') as opf_file:
            opf_mi = OPF(opf_file).to_book_metadata()
        mi.smart_update(opf_mi)

    for pref in config().option_set.preferences:
        # Skipped here: these are handled individually below or are not
        # metadata fields.
        if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort',
                         'author_sort', 'get_cover', 'cover', 'tags',
                         'lrf_bookid', 'identifiers'):
            continue
        val = getattr(opts, pref.name, None)
        if val is not None:
            setattr(mi, pref.name, val)
    if getattr(opts, 'authors', None) is not None:
        mi.authors = string_to_authors(opts.authors)
        mi.author_sort = authors_to_sort_string(mi.authors)
    # An explicit author sort wins over the generated one.
    if getattr(opts, 'author_sort', None) is not None:
        mi.author_sort = opts.author_sort
    if getattr(opts, 'title_sort', None) is not None:
        mi.title_sort = opts.title_sort
    elif getattr(opts, 'title', None) is not None:
        mi.title_sort = title_sort(opts.title)
    if getattr(opts, 'tags', None) is not None:
        mi.tags = [t.strip() for t in opts.tags.split(',')]
    if getattr(opts, 'series', None) is not None:
        mi.series = opts.series.strip()
    if getattr(opts, 'series_index', None) is not None:
        mi.series_index = float(opts.series_index.strip())
    if getattr(opts, 'pubdate', None) is not None:
        mi.pubdate = parse_date(opts.pubdate, assume_utc=False, as_utc=False)
    if getattr(opts, 'identifiers', None):
        val = {k.strip():v.strip() for k, v in (x.partition(':')[0::2] for x in opts.identifiers)}
        if val:
            # Merge into the existing identifiers and drop entries whose
            # key or value is empty.
            orig = mi.get_identifiers()
            orig.update(val)
            val = {k:v for k, v in iteritems(orig) if k and v}
            mi.set_identifiers(val)

    if getattr(opts, 'cover', None) is not None:
        ext = os.path.splitext(opts.cover)[1].replace('.', '').upper()
        # Close the cover file after reading it.
        with open(opts.cover, 'rb') as cover_file:
            mi.cover_data = (ext, cover_file.read())

    with force_identifiers:
        set_metadata(stream, mi, stream_type)
Example #25
0
File: lrx.py Project: AEliu/calibre
def get_metadata(f):
    """
    Read the metadata section of an LRX file and return it as a
    MetaInformation object.

    f must be a seekable file-like object. ValueError is raised when the
    header does not carry the ``ftypLRX2`` signature, when a chunk cannot be
    read while looking for the ``bbeb`` chunk, when the ``LRF`` marker is
    missing, or when the decompressed metadata does not have the length
    recorded in the file.
    """
    def read_at(position, count):
        return _read(f, position, count)

    f.seek(0)
    chunk = f.read(12)
    if chunk[4:] != 'ftypLRX2':
        if chunk[4:8] == 'LRX':
            raise ValueError('Librie LRX format not supported')
        raise ValueError('Not a LRX file')

    # Hop from chunk to chunk until the one tagged 'bbeb' is reached; the
    # value decoded from the first four bytes of a chunk is the distance to
    # the next one.
    position = 0
    while True:
        position += word_be(chunk[:4])
        try:
            chunk = read_at(position, 8)
        except:
            raise ValueError('Not a valid LRX file')
        if chunk[4:] == 'bbeb':
            break

    position += 8
    lrf_header = read_at(position, 16)
    if lrf_header[:8].decode('utf-16-le') != 'LRF\x00':
        raise ValueError('Not a valid LRX file')
    version = word_le(lrf_header[8:12])

    position += 0x4c
    packed_length = short_le(read_at(position, 2)) - 4
    position += 8 if version >= 800 else 2
    expected_length = word_le(read_at(position, 4))
    # The compressed payload is read from wherever the previous read left f.
    xml_data = decompress(f.read(packed_length))
    if len(xml_data) != expected_length:
        raise ValueError('LRX file has malformed metadata section')

    root = etree.fromstring(xml_data)
    book_info = root.find('BookInfo')
    title_node = book_info.find('Title')
    title_reading = title_node.get('reading', None)
    author_node = book_info.find('Author')
    author_reading = author_node.get('reading', None)

    mi = MetaInformation(title_node.text, string_to_authors(author_node.text))
    mi.title_sort = title_reading
    mi.author_sort = author_reading
    # getattr copes with a missing Publisher element (find() gives None).
    mi.publisher = getattr(book_info.find('Publisher'), 'text', None)
    mi.tags = [category.text for category in book_info.findall('Category')]
    mi.language = root.find('DocInfo').find('Language').text
    return mi
Example #26
0
def get_metadata(f):
    """
    Read the metadata section of an LRX file and return it as a
    MetaInformation object.

    f must be a seekable binary file-like object. ValueError is raised when
    the header does not carry the ``ftypLRX2`` signature, when the ``bbeb``
    chunk cannot be located, when the ``LRF`` marker is missing, or when the
    decompressed metadata does not have the length recorded in the file.
    """
    read = lambda at, amount: _read(f, at, amount)
    f.seek(0)
    buf = f.read(12)
    if buf[4:] == b'ftypLRX2':
        # Hop from chunk to chunk until the one tagged 'bbeb' is reached; the
        # value decoded from the first four bytes of a chunk is the distance
        # to the next one.
        offset = 0
        while True:
            step = word_be(buf[:4])
            if not step:
                # A zero distance would make the loop re-read the same chunk
                # forever, so treat it as a corrupt file.
                raise ValueError('Not a valid LRX file')
            offset += step
            try:
                buf = read(offset, 8)
            except Exception:
                raise ValueError('Not a valid LRX file')
            if buf[4:] == b'bbeb':
                break
        offset += 8
        buf = read(offset, 16)
        if buf[:8].decode('utf-16-le') != 'LRF\x00':
            raise ValueError('Not a valid LRX file')
        lrf_version = word_le(buf[8:12])
        offset += 0x4c
        compressed_size = short_le(read(offset, 2))
        offset += 2
        if lrf_version >= 800:
            offset += 6
        compressed_size -= 4
        uncompressed_size = word_le(read(offset, 4))
        # The compressed payload is read from wherever the previous read
        # left f.
        info = decompress(f.read(compressed_size))
        if len(info) != uncompressed_size:
            raise ValueError('LRX file has malformed metadata section')
        root = safe_xml_fromstring(info)
        bi = root.find('BookInfo')
        title = bi.find('Title')
        title_sort = title.get('reading', None)
        title = title.text
        author = bi.find('Author')
        author_sort = author.get('reading', None)
        mi = MetaInformation(title, string_to_authors(author.text))
        mi.title_sort, mi.author_sort = title_sort, author_sort
        publisher = bi.find('Publisher')
        # getattr copes with a missing Publisher element (find() gives None).
        mi.publisher = getattr(publisher, 'text', None)
        mi.tags = [x.text for x in bi.findall('Category')]
        mi.language = root.find('DocInfo').find('Language').text
        return mi

    elif buf[4:8] == b'LRX':
        raise ValueError('Librie LRX format not supported')
    else:
        raise ValueError('Not a LRX file')
Example #27
0
File: odt.py Project: Eksmo/calibre
def get_metadata(stream):
    """
    Build a MetaInformation object from the ``meta.xml`` member of the zip
    archive in stream.

    ``meta.xml`` is parsed with a namespace-aware SAX parser that uses an
    odfmetaparser instance as its content handler; the handler's
    ``seenfields`` mapping supplies the title, authors, comments, language
    and tags. Authors come from ``initial-creator`` when that field is
    non-blank, otherwise from ``creator``.
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        # Strip each keyword and drop empty ones so that "a, b," produces
        # ['a', 'b'] rather than ['a', ' b', ''].
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]

    return mi
Example #28
0
 def add_author(self):
     """
     Add the author name(s) typed into self.author to the list self.al.

     The text is split with string_to_authors. A name whose icu_lower form
     matches an existing entry but whose spelling differs replaces the text
     of that entry; every other name is added as a new item. The input
     field is cleared afterwards.
     """
     text = self.author.text().strip()
     existing = list(self.authors)
     # Maps lower-cased name -> (row in self.al, name as displayed).
     authors = OrderedDict((icu_lower(x), (i, x)) for i, x in enumerate(existing))
     # Row the next added item will occupy.
     # NOTE(review): assumes addItem appends at the end of self.al -- confirm.
     next_row = len(existing)
     if text:
         for author in string_to_authors(text):
             la = icu_lower(author)
             if la in authors and authors[la][1] != author:
                 # Case change
                 i = authors[la][0]
                 authors[la] = (i, author)
                 self.al.item(i).setText(author)
             else:
                 self.al.addItem(author)
                 # Keep the (row, name) shape; storing the bare string here
                 # broke the lookups above when the same name appeared again
                 # with different case in one input.
                 authors[la] = (next_row, author)
                 next_row += 1
     self.author.setText('')
Example #29
0
def get_metadata(stream):
    """
    Build a MetaInformation object from the ``meta.xml`` member of the zip
    archive in stream.

    ``meta.xml`` is fed to a namespace-aware SAX parser whose content
    handler is an odfmetaparser instance; the fields the handler collected
    in ``seenfields`` are copied to the result. ``initial-creator`` is
    preferred over ``creator`` for the authors when it is non-blank.
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        # Keywords are comma separated; surrounding whitespace and empty
        # entries are not meaningful tags, so remove them.
        keywords = (x.strip() for x in data['keywords'].split(','))
        mi.tags = [x for x in keywords if x]

    return mi
Example #30
0
    def update_state(self, *args):
        """
        Refresh the widget's colour and help texts.

        The author sort is computed from the text of self.authors_edit (with
        a trailing "et al." removed) and compared to self.current_val using
        strcmp. A match gives a green background and the first tooltip,
        a mismatch a red background and the second tooltip.
        """
        typed = re.sub(r'\s+et al\.$', '', unicode(self.authors_edit.text()))
        expected = self.db.author_sort_from_authors(string_to_authors(typed))
        normal = strcmp(expected, self.current_val) == 0

        background = 'rgb(0, 255, 0, 20%)' if normal else 'rgb(255, 0, 0, 20%)'
        self.setStyleSheet(
            'QLineEdit { color: black; background-color: %s; }' % background)

        tip = self.tooltips[0] if normal else self.tooltips[1]
        self.setToolTip(tip)
        self.setWhatsThis(tip)
Example #31
0
 def add_author(self):
     """
     Add the author name(s) typed into self.author to the list self.al.

     Names are obtained with string_to_authors. When a name matches an
     existing entry under icu_lower but is spelled differently, the existing
     entry's text is replaced; otherwise the name is added as a new item.
     The input field is emptied at the end.
     """
     text = self.author.text().strip()
     current = list(self.authors)
     # Lower-cased name -> (row in self.al, displayed name).
     authors = OrderedDict((icu_lower(x), (i, x)) for i, x in enumerate(current))
     # NOTE(review): rows of added items are counted on the assumption that
     # addItem appends to the end of self.al -- confirm.
     row_of_next_item = len(current)
     if text:
         for author in string_to_authors(text):
             la = icu_lower(author)
             if la in authors and authors[la][1] != author:
                 # Case change
                 i = authors[la][0]
                 authors[la] = (i, author)
                 self.al.item(i).setText(author)
             else:
                 self.al.addItem(author)
                 # Store a (row, name) tuple like every other entry; a bare
                 # string here made authors[la][0] / [1] return single
                 # characters on a later case-change lookup.
                 authors[la] = (row_of_next_item, author)
                 row_of_next_item += 1
     self.author.setText('')
Example #32
0
    def update_state(self, *args):
        """
        Recolour the widget according to whether the author sort derived
        from self.authors_edit equals self.current_val (as judged by strcmp),
        and set the matching tooltip / what's-this text from self.tooltips.
        """
        names = unicode(self.authors_edit.text())
        # Ignore a trailing "et al." when splitting the names.
        names = re.sub(r'\s+et al\.$', '', names)
        derived = self.db.author_sort_from_authors(string_to_authors(names))

        if strcmp(derived, self.current_val) == 0:
            normal, col = True, 'rgb(0, 255, 0, 20%)'
        else:
            normal, col = False, 'rgb(255, 0, 0, 20%)'

        sheet = 'QLineEdit { color: black; background-color: %s; }' % col
        self.setStyleSheet(sheet)

        help_text = self.tooltips[0 if normal else 1]
        self.setToolTip(help_text)
        self.setWhatsThis(help_text)
Example #33
0
 def cell_changed(self, row, col):
     """
     React to an edit of the table cell at (row, col).

     For column 0 the text is split with string_to_authors; if that does not
     give exactly one name an error dialog is shown and the cell text is
     replaced by the names joined with ' % '. Column 1 of the same row is
     then set to author_to_author_sort of the name and becomes the current
     item. For any other column the edited cell becomes the current item.
     """
     if col != 0:
         target = self.table.item(row, col)
     else:
         name = unicode(self.table.item(row, 0).text()).strip()
         pieces = string_to_authors(name)
         if len(pieces) != 1:
             dialog = error_dialog(self.parent(), _('Invalid author name'),
                     _('You cannot change an author to multiple authors.'))
             dialog.exec_()
             name = ' % '.join(pieces)
             self.table.item(row, 0).setText(name)
         target = self.table.item(row, 1)
         target.setText(author_to_author_sort(name))
     self.table.setCurrentItem(target)
     self.table.scrollToItem(target)
Example #34
0
 def cell_changed(self, row, col):
     """
     Handle a change of the table cell at (row, col).

     Column 0: the stripped text is split with string_to_authors. Unless
     exactly one name results, an error dialog is shown and the cell is
     rewritten as the names joined with ' % '. The cell in column 1 is then
     filled with author_to_author_sort of the name and selected.
     Other columns: the changed cell itself is selected.
     The selected item is also scrolled into view.
     """
     if col == 0:
         edited = self.table.item(row, 0)
         author = unicode_type(edited.text()).strip()
         split_names = string_to_authors(author)
         if len(split_names) != 1:
             warning = error_dialog(self.parent(), _('Invalid author name'),
                     _('You cannot change an author to multiple authors.'))
             warning.exec_()
             author = ' % '.join(split_names)
             self.table.item(row, 0).setText(author)
         selected = self.table.item(row, 1)
         selected.setText(author_to_author_sort(author))
     else:
         selected = self.table.item(row, col)
     self.table.setCurrentItem(selected)
     self.table.scrollToItem(selected)
Example #35
0
def main(opts, args, dbctx):
    """
    Convert the parsed options into metadata values and call do_add, or
    do_add_empty when opts.empty is set.

    Authors, tags, language codes and identifiers are derived from opts;
    identifiers are given as ``key:value`` strings and entries with a blank
    key or value are dropped. Returns 0. Raises SystemExit when files are
    to be added but args is empty.
    """
    authors = string_to_authors(opts.authors) if opts.authors else []

    tags = []
    if opts.tags:
        tags = [tag.strip() for tag in opts.tags.split(',')]

    languages = []
    for raw in (opts.languages or '').split(','):
        code = canonicalize_lang(raw)
        if code:
            languages.append(code)

    identifiers = {}
    for spec in opts.identifier:
        key, sep, value = spec.partition(':')
        if key.strip() and value.strip():
            identifiers[key.strip()] = value.strip()

    if not opts.empty:
        if len(args) < 1:
            raise SystemExit(_('You must specify at least one file to add'))
        do_add(dbctx, args, opts.one_book_per_directory, opts.recurse,
               opts.duplicates, opts.title, authors, opts.isbn, tags,
               opts.series, opts.series_index, opts.cover, identifiers,
               languages, opts.filters)
        return 0

    do_add_empty(dbctx, opts.title, authors, opts.isbn, tags, opts.series,
                 opts.series_index, opts.cover, identifiers, languages)
    return 0
Example #36
0
File: meta.py Project: Farb/calibre
def get_metadata(stream):
    """
    Return basic meta-data about the LRF file in C{stream} as a
    L{MetaInformation} object.

    Title, author, category and publisher are set to None when they are
    empty or contain 'unknown' (in any letter case); the publisher also when
    it contains 'some publisher'.

    @param stream: A file like object or an instance of L{LRFMetaFile}
    """
    def is_placeholder(text):
        # Empty, or containing the word 'unknown' in any letter case.
        return not text or 'unknown' in text.lower()

    if isinstance(stream, LRFMetaFile):
        lrf = stream
    else:
        lrf = LRFMetaFile(stream)

    authors = string_to_authors(lrf.author)
    mi = MetaInformation(lrf.title.strip(), authors)
    mi.author = lrf.author.strip()
    mi.comments = lrf.free_text.strip()
    mi.category = ', '.join((lrf.category.strip(), lrf.classification.strip()))

    # Category and classification both contribute tags.
    tags = []
    for piece in mi.category.split(','):
        piece = piece.strip()
        if piece:
            tags.append(piece)
    if tags:
        mi.tags = tags
    if mi.category.strip() == ',':
        # Both parts were empty, only the separator is left.
        mi.category = None

    mi.publisher = lrf.publisher.strip()
    mi.cover_data = lrf.get_cover()

    # The reading (sort) fields are read on a best-effort basis.
    try:
        mi.title_sort = lrf.title_reading.strip()
        if not mi.title_sort:
            mi.title_sort = None
    except:
        pass
    try:
        mi.author_sort = lrf.author_reading.strip()
        if not mi.author_sort:
            mi.author_sort = None
    except:
        pass

    if is_placeholder(mi.title):
        mi.title = None
    if not mi.authors:
        mi.authors = None
    if is_placeholder(mi.author):
        mi.author = None
    if is_placeholder(mi.category):
        mi.category = None
    if is_placeholder(mi.publisher) or 'some publisher' in mi.publisher.lower():
        mi.publisher = None

    return mi
Example #37
0
def get_metadata(stream):
    """
    Return basic meta-data about the LRF file in C{stream} as a
    L{MetaInformation} object.

    Title, author, category and publisher are reset to None when they are
    empty or contain 'unknown' (case-insensitively); the publisher also when
    it contains 'some publisher'. An empty author list becomes None.

    @param stream: A file like object or an instance of L{LRFMetaFile}
    """
    lrf = stream if isinstance(stream, LRFMetaFile) else LRFMetaFile(stream)
    authors = string_to_authors(lrf.author)
    mi = MetaInformation(lrf.title.strip(), authors)
    mi.author = lrf.author.strip()
    mi.comments = lrf.free_text.strip()
    mi.category = lrf.category.strip() + ', ' + lrf.classification.strip()
    # Category and classification both contribute tags.
    tags = [x.strip() for x in mi.category.split(',') if x.strip()]
    if tags:
        mi.tags = tags
    if mi.category.strip() == ',':
        # Both parts were empty, only the separator is left.
        mi.category = None
    mi.publisher = lrf.publisher.strip()
    mi.cover_data = lrf.get_cover()
    # The reading (sort) fields are best effort: any ordinary error while
    # fetching them is ignored, but KeyboardInterrupt and SystemExit are no
    # longer swallowed.
    try:
        mi.title_sort = lrf.title_reading.strip()
        if not mi.title_sort:
            mi.title_sort = None
    except Exception:
        pass
    try:
        mi.author_sort = lrf.author_reading.strip()
        if not mi.author_sort:
            mi.author_sort = None
    except Exception:
        pass
    if not mi.title or 'unknown' in mi.title.lower():
        mi.title = None
    if not mi.authors:
        mi.authors = None
    if not mi.author or 'unknown' in mi.author.lower():
        mi.author = None
    if not mi.category or 'unknown' in mi.category.lower():
        mi.category = None
    if not mi.publisher or 'unknown' in mi.publisher.lower() or \
            'some publisher' in mi.publisher.lower():
        mi.publisher = None

    return mi
Example #38
0
    def metadata_from_path(cls, path):
        """
        Return the metadata for the file at path.

        The result of cls.metadata_from_formats is used as is unless its
        title or authors are the translated 'Unknown' and its title contains
        '#'. In that case the file name (without directory and extension) is
        matched against cls.JETBOOK_FILE_NAME_PATTERN and, on a match, the
        title and authors are taken from the 'title' and 'authors' groups
        with underscores turned into spaces.
        """
        def as_text(value):
            # Decode byte strings, then turn underscores into spaces.
            if not isinstance(value, unicode_type):
                value = value.decode(filesystem_encoding, 'replace')
            return value.replace('_', ' ')

        mi = cls.metadata_from_formats([path])

        unknown = mi.title == _('Unknown') or mi.authors == [_('Unknown')]
        if not (unknown and '#' in mi.title):
            return mi

        stem = os.path.splitext(os.path.basename(path))[0]
        found = cls.JETBOOK_FILE_NAME_PATTERN.match(stem)
        if found is None:
            return mi

        mi.title = as_text(found.group('title'))
        names = string_to_authors(found.group('authors'))
        mi.authors = [as_text(name) for name in names]
        return mi
Example #39
0
def get_metadata(stream):
    """
    Return metadata as a L{MetaInfo} object

    A stream that does not start with the RTF signature, or for which
    get_document_info yields no info block, gives a MetaInformation carrying
    only the translated title 'Unknown'. Otherwise title, author, comments,
    tags and publisher are searched for in the info block and decoded with
    the detected code page.
    """
    stream.seek(0)
    if stream.read(5) != r'{\rtf':
        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
        return MetaInformation(_('Unknown'))

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    found = title_pat.search(block)
    title = _('Unknown') if found is None else decode(found.group(1).strip(), cpg)

    found = author_pat.search(block)
    author = None if found is None else decode(found.group(1).strip(), cpg)

    mi = MetaInformation(title)
    if author:
        mi.authors = string_to_authors(author)

    found = comment_pat.search(block)
    if found is not None:
        mi.comments = decode(found.group(1).strip(), cpg)

    found = tags_pat.search(block)
    if found is not None:
        raw_tags = decode(found.group(1).strip(), cpg)
        # Comma separated; blank entries are dropped.
        mi.tags = [piece.strip() for piece in raw_tags.split(',') if piece.strip()]

    found = publisher_pat.search(block)
    if found is not None:
        mi.publisher = decode(found.group(1).strip(), cpg)

    return mi
Example #40
0
def command_add(args, dbpath):
    """
    Parse the options in args and pass them to do_add, or to do_add_empty
    when the ``empty`` option is set.

    args is the argument list without the program name; dbpath is handed to
    get_db together with the parsed options. Returns 0 after calling
    do_add / do_add_empty and 1 when no file to add was given.
    """
    from calibre.ebooks.metadata import string_to_authors
    parser = add_option_parser()
    # sys.argv[:1] (the program name) is prepended, so in the parsed
    # positional list index 0 is the program name and files start at index 1.
    opts, args = parser.parse_args(sys.argv[:1] + args)
    aut = string_to_authors(opts.authors) if opts.authors else []
    tags = [x.strip() for x in opts.tags.split(',')] if opts.tags else []
    if opts.empty:
        do_add_empty(get_db(dbpath, opts), opts.title, aut, opts.isbn, tags,
                opts.series, opts.series_index)
        return 0
    if len(args) < 2:
        # Only the program name is left: nothing to add.
        parser.print_help()
        # Python 2 print statements: an empty line on stdout, then the
        # message on stderr.
        print
        print >>sys.stderr, _('You must specify at least one file to add')
        return 1
    do_add(get_db(dbpath, opts), args[1:], opts.one_book_per_directory,
            opts.recurse, opts.duplicates, opts.title, aut, opts.isbn,
            tags, opts.series, opts.series_index)
    return 0
Example #41
0
def command_add(args, dbpath):
    """
    Parse the options in args and call do_add with the resulting values, or
    do_add_empty when the ``empty`` option is set.

    args does not include the program name; dbpath and the parsed options
    are passed to get_db to obtain the database. Returns 0 once do_add or
    do_add_empty has been called, 1 when there is no file to add.
    """
    from calibre.ebooks.metadata import string_to_authors
    parser = add_option_parser()
    # The program name from sys.argv is put in front, which is why the file
    # arguments are args[1:] below.
    opts, args = parser.parse_args(sys.argv[:1] + args)
    aut = string_to_authors(opts.authors) if opts.authors else []
    tags = [x.strip() for x in opts.tags.split(',')] if opts.tags else []
    if opts.empty:
        do_add_empty(get_db(dbpath, opts), opts.title, aut, opts.isbn, tags,
                opts.series, opts.series_index)
        return 0
    if len(args) < 2:
        # No positional argument besides the program name.
        parser.print_help()
        # Python 2 print statements: blank line to stdout, error to stderr.
        print
        print >>sys.stderr, _('You must specify at least one file to add')
        return 1
    do_add(get_db(dbpath, opts), args[1:], opts.one_book_per_directory,
            opts.recurse, opts.duplicates, opts.title, aut, opts.isbn,
            tags, opts.series, opts.series_index)
    return 0
Example #42
0
def get_metadata(stream):
    """
    Return metadata as a L{MetaInfo} object

    When the stream does not begin with the RTF signature, or
    get_document_info finds no info block, the result only carries the
    translated title 'Unknown'. Otherwise the info block is searched for
    title, author, comments, tags and publisher, each decoded with the code
    page reported by detect_codepage.
    """
    stream.seek(0)
    signature = stream.read(5)
    if signature != r'{\rtf':
        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
        return MetaInformation(_('Unknown'))

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    def decoded(match):
        # Text of the first group, stripped and decoded with the code page.
        return decode(match.group(1).strip(), cpg)

    hit = title_pat.search(block)
    if hit is None:
        title = _('Unknown')
    else:
        title = decoded(hit)

    hit = author_pat.search(block)
    author = decoded(hit) if hit is not None else None

    mi = MetaInformation(title)
    if author:
        mi.authors = string_to_authors(author)

    hit = comment_pat.search(block)
    if hit is not None:
        mi.comments = decoded(hit)

    hit = tags_pat.search(block)
    if hit is not None:
        stripped = (part.strip() for part in decoded(hit).split(','))
        mi.tags = [part for part in stripped if part]

    hit = publisher_pat.search(block)
    if hit is not None:
        mi.publisher = decoded(hit)

    return mi
Example #43
0
def get_metadata(stream):
    """
    Return metadata as a L{MetaInfo} object

    The title and author are read from ``KEY=value`` lines of the info
    section of the RB file in stream. When the header or the info section
    is not found, a message is printed to stderr and a MetaInformation with
    title and author 'Unknown' is returned. Any other error is reported on
    stderr and re-raised.
    """
    title = 'Unknown'
    mi = MetaInformation(title, ['Unknown'])
    stream.seek(0)
    try:
        if not stream.read(14) == MAGIC:
            print >> sys.stderr, u'Couldn\'t read RB header from file'
            return mi
        # Skip the next 10 bytes.
        stream.read(10)

        # Reads one little-endian unsigned 32-bit integer from the stream.
        read_i32 = lambda: struct.unpack('<I', stream.read(4))[0]

        # The integer read here is the position to continue at; the next
        # integer there is the number of table entries.
        stream.seek(read_i32())
        toc_count = read_i32()

        # Each entry: 32 skipped bytes followed by length, offset and flag.
        # The entry with flag 2 is the one used as the info section.
        for i in range(toc_count):
            stream.read(32)
            length, offset, flag = read_i32(), read_i32(), read_i32()
            if flag == 2:
                break
        else:
            # No entry had flag 2 (also reached when toc_count is 0).
            print >> sys.stderr, u'Couldn\'t find INFO from RB file'
            return mi

        stream.seek(offset)
        info = stream.read(length).splitlines()
        for line in info:
            if '=' not in line:
                continue
            # NOTE: a line with more than one '=' makes this unpacking raise
            # ValueError, which the handler below reports and re-raises.
            key, value = line.split('=')
            if key.strip() == 'TITLE':
                mi.title = value.strip()
            elif key.strip() == 'AUTHOR':
                mi.author = value
                mi.authors = string_to_authors(value)
    except Exception as err:
        msg = u'Couldn\'t read metadata from rb: %s with error %s' % (
            mi.title, unicode(err))
        print >> sys.stderr, msg.encode('utf8')
        raise
    return mi
Example #44
0
    def metadata_from_path(cls, path):
        """
        Return the metadata for the file at path, falling back to the file
        name for title and authors.

        The fallback applies only when the metadata obtained from
        cls.metadata_from_formats has the translated 'Unknown' as title or
        as sole author, its title contains '#', and the base name of path
        (extension removed) matches cls.JETBOOK_FILE_NAME_PATTERN.
        """

        def normalise(text):
            # Byte strings are decoded first; underscores stand for spaces.
            decoded = text if isinstance(text, unicode_type) else \
                text.decode(filesystem_encoding, 'replace')
            return decoded.replace('_', ' ')

        mi = cls.metadata_from_formats([path])

        placeholder = mi.title == _('Unknown') or mi.authors == [_('Unknown')]
        if placeholder and '#' in mi.title:
            base_name = os.path.basename(path)
            stem = os.path.splitext(base_name)[0]
            parsed = cls.JETBOOK_FILE_NAME_PATTERN.match(stem)
            if parsed is not None:
                mi.title = normalise(parsed.group('title'))
                mi.authors = [
                    normalise(name)
                    for name in string_to_authors(parsed.group('authors'))
                ]

        return mi
Example #45
0
def get_metadata(stream):
    """
    Return metadata as a L{MetaInfo} object

    The title and author are read from ``KEY=value`` lines of the info
    section of the RB file in stream. When the header or the info section
    is not found, a message is printed to stderr and a MetaInformation with
    title and author 'Unknown' is returned. Any other error is reported on
    stderr and re-raised.
    """
    title = 'Unknown'
    mi = MetaInformation(title, ['Unknown'])
    stream.seek(0)
    try:
        if not stream.read(14) == MAGIC:
            print(u'Couldn\'t read RB header from file', file=sys.stderr)
            return mi
        # Skip the next 10 bytes.
        stream.read(10)

        # Reads one little-endian unsigned 32-bit integer from the stream.
        read_i32 = lambda: struct.unpack('<I', stream.read(4))[0]

        # The integer read here is the position to continue at; the next
        # integer there is the number of table entries.
        stream.seek(read_i32())
        toc_count = read_i32()

        # Each entry: 32 skipped bytes followed by length, offset and flag.
        # The entry with flag 2 is the one used as the info section.
        for i in range(toc_count):
            stream.read(32)
            length, offset, flag = read_i32(), read_i32(), read_i32()
            if flag == 2:
                break
        else:
            # No entry had flag 2 (also reached when toc_count is 0).
            print(u'Couldn\'t find INFO from RB file', file=sys.stderr)
            return mi

        stream.seek(offset)
        info = stream.read(length).splitlines()
        for line in info:
            if '=' not in line:
                continue
            # Split on the first '=' only, so a value that itself contains
            # '=' is kept whole instead of raising ValueError on unpacking.
            key, value = line.split('=', 1)
            if key.strip() == 'TITLE':
                mi.title = value.strip()
            elif key.strip() == 'AUTHOR':
                mi.author = value
                mi.authors = string_to_authors(value)
    except Exception as err:
        msg = u'Couldn\'t read metadata from rb: %s with error %s'%(mi.title, unicode(err))
        print(msg.encode('utf8'), file=sys.stderr)
        raise
    return mi
def read_doc_props(raw, mi, XPath):
    """
    Parse the document-properties XML in raw and copy what it contains to mi.

    XPath is a factory: XPath(expression) returns a callable that is applied
    to the parsed root. The title, tags (from dc:subject and cp:keywords),
    authors with author sort, comments (from dc:description) and languages
    are set on mi only when the document provides a non-empty value.
    """
    root = fromstring(raw)

    def select(expression):
        return XPath(expression)(root)

    title_nodes = select('//dc:title')
    if title_nodes:
        text = title_nodes[0].text
        if text and text.strip():
            mi.title = text.strip()

    # Subjects are single tags (commas replaced), keywords are split on
    # whitespace and commas.
    tags = [
        node.text.strip().replace(',', '_')
        for node in select('//dc:subject')
        if node.text and node.text.strip()
    ]
    for node in select('//cp:keywords'):
        if not (node.text and node.text.strip()):
            continue
        for word in node.text.split():
            tags.extend(part.strip() for part in word.split(',') if part.strip())
    if tags:
        mi.tags = tags

    aut = []
    for node in select('//dc:creator'):
        if node.text and node.text.strip():
            aut.extend(string_to_authors(node.text))
    if aut:
        mi.authors = aut
        mi.author_sort = authors_to_sort_string(aut)

    descriptions = select('//dc:description')
    if descriptions:
        summary = etree.tostring(descriptions[0], method='text', encoding=unicode_type)
        # Word 2007 mangles newlines in the summary
        mi.comments = summary.replace('_x000d_', '').strip()

    langs = []
    for node in select('//dc:language'):
        if not (node.text and node.text.strip()):
            continue
        code = canonicalize_lang(node.text)
        if code:
            langs.append(code)
    if langs:
        mi.languages = langs
Example #47
0
def get_metadata_quick(raw):
    """
    Load raw into a podofo.PDFDoc and return a MetaInformation built from
    its title, author, creator and keywords.

    A missing title becomes '_', a missing author the translated 'Unknown'.
    The creator is stored as book_producer. Keywords are split on commas;
    if they cannot be read, no tags are set.
    """
    doc = podofo.PDFDoc()
    doc.load(raw)

    title = doc.title or '_'
    author = doc.author
    if author:
        authors = string_to_authors(author)
    else:
        authors = [_('Unknown')]
    producer = doc.creator

    try:
        tags = [t for t in (k.strip() for k in doc.keywords.split(u',')) if t]
    except:
        tags = []

    mi = MetaInformation(title, authors)
    if producer:
        mi.book_producer = producer
    if tags:
        mi.tags = tags
    return mi
Example #48
0
def get_metadata_quick(raw):
    """
    Return a MetaInformation for the PDF data in raw, using the title,
    author, creator and keywords reported by podofo.PDFDoc.

    Fallbacks: '_' for an empty title and the translated 'Unknown' for an
    empty author. book_producer and tags are only set when the document
    supplies a creator / at least one non-blank keyword.
    """
    pdf = podofo.PDFDoc()
    pdf.load(raw)

    title = pdf.title
    title = title if title else '_'
    author = pdf.author
    authors = [_('Unknown')] if not author else string_to_authors(author)
    creator = pdf.creator

    tags = []
    try:
        # Any failure while reading or splitting the keywords means no tags.
        pieces = [piece.strip() for piece in pdf.keywords.split(u',')]
        tags = [piece for piece in pieces if piece]
    except:
        tags = []

    mi = MetaInformation(title, authors)
    if creator:
        mi.book_producer = creator
    if tags:
        mi.tags = tags
    return mi
Example #49
0
def read_doc_props(raw, mi):
    """
    Parse the document-properties XML in raw and copy what it contains to mi.

    The title, tags (from dc:subject and cp:keywords), authors with author
    sort, comments (from dc:description) and languages are set on mi only
    when the document provides a non-empty value for them.
    """
    root = fromstring(raw)

    def has_text(node):
        # True when the element has text that is not just whitespace.
        return bool(node.text and node.text.strip())

    found_titles = XPath('//dc:title')(root)
    if found_titles:
        first = found_titles[0].text
        if first and first.strip():
            mi.title = first.strip()

    tags = []
    for node in XPath('//dc:subject')(root):
        if has_text(node):
            # A subject is one tag, so commas inside it are replaced.
            tags.append(node.text.strip().replace(',', '_'))
    for node in XPath('//cp:keywords')(root):
        if has_text(node):
            # Keywords are separated by whitespace and by commas.
            for chunk in node.text.split():
                tags.extend(item.strip() for item in chunk.split(',') if item.strip())
    if tags:
        mi.tags = tags

    aut = []
    for node in XPath('//dc:creator')(root):
        if has_text(node):
            aut.extend(string_to_authors(node.text))
    if aut:
        mi.authors = aut
        mi.author_sort = authors_to_sort_string(aut)

    found_descriptions = XPath('//dc:description')(root)
    if found_descriptions:
        text = etree.tostring(found_descriptions[0], method='text', encoding=unicode)
        text = text.replace('_x000d_', '')  # Word 2007 mangles newlines in the summary
        mi.comments = text.strip()

    langs = []
    for node in XPath('//dc:language')(root):
        if has_text(node):
            canonical = canonicalize_lang(node.text)
            if canonical:
                langs.append(canonical)
    if langs:
        mi.languages = langs
Example #50
0
def main(opts, args, dbctx):
    """
    Derive authors, tags, language codes and identifiers from opts and call
    do_add_empty (when opts.empty is set) or do_add with them.

    Identifiers are given as ``key:value`` strings; entries whose key or
    value is blank are ignored. Returns 0. Raises SystemExit when files are
    to be added but args is empty.
    """
    aut = []
    if opts.authors:
        aut = string_to_authors(opts.authors)
    tags = [tag.strip() for tag in opts.tags.split(',')] if opts.tags else []

    canonical = (canonicalize_lang(name) for name in (opts.languages or '').split(','))
    lcodes = [code for code in canonical if code]

    split_specs = (spec.partition(':')[::2] for spec in opts.identifier)
    cleaned = ((key.strip(), value.strip()) for key, value in split_specs)
    identifiers = {key: value for key, value in cleaned if key and value}

    if opts.empty:
        do_add_empty(dbctx, opts.title, aut, opts.isbn, tags, opts.series,
                     opts.series_index, opts.cover, identifiers, lcodes)
        return 0

    if len(args) < 1:
        raise SystemExit(_('You must specify at least one file to add'))

    do_add(dbctx, args, opts.one_book_per_directory, opts.recurse,
           opts.duplicates, opts.title, aut, opts.isbn, tags, opts.series,
           opts.series_index, opts.cover, identifiers, lcodes, opts.filters)
    return 0
Example #51
0
 def opts_to_mi(self, mi):
     """
     Copy every option named in self.metadata_option_names that is not None
     from self.opts to mi, converting it first where needed.

     authors are split with string_to_authors, tags on commas, rating and
     series_index are converted to float, timestamp and pubdate are parsed
     with parse_date. A value that fails the numeric or date conversion is
     logged and skipped.
     """
     from calibre.ebooks.metadata import string_to_authors
     for name in self.metadata_option_names:
         value = getattr(self.opts, name, None)
         if value is None:
             continue
         if name == 'authors':
             value = string_to_authors(value)
         elif name == 'tags':
             value = [tag.strip() for tag in value.split(',')]
         elif name in ('rating', 'series_index'):
             try:
                 value = float(value)
             except ValueError:
                 self.log.warn(
                     _('Values of series index and rating must be numbers. Ignoring'),
                     value)
                 continue
         elif name in ('timestamp', 'pubdate'):
             try:
                 # Only pubdate is parsed with assume_utc enabled.
                 value = parse_date(value, assume_utc=(name == 'pubdate'))
             except:
                 message = _('Failed to parse date/time') + ' ' + unicode(value)
                 self.log.exception(message)
                 continue
         setattr(mi, name, value)
Example #52
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """
    Build a MetaInformation object from a file name using a regular
    expression with named groups.

    name is the file name (byte strings are decoded with the filesystem
    encoding). pat is a compiled pattern; when None, the pattern stored
    under the ``filename_pattern`` preference is compiled and used.
    fallback_pat, if given, is tried when pat does not match. The groups
    looked up are title, author, series, series_index, isbn, publisher,
    published and comments; a group that is missing from the pattern is
    skipped. If no title results, the name (extension removed, underscores
    replaced by spaces) is used as the title.
    """
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    # Keep what precedes the last '.'.
    # NOTE: for a name without any '.', rpartition puts everything in the
    # last element, so name becomes the empty string here.
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # match.group() raises IndexError for a group name the pattern does
        # not define, hence one try block per field.
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:

                    def swap(a):
                        # Move the part after the first comma (or, without a
                        # comma, after the first whitespace) to the front.
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)

                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            # TypeError covers a group that exists but did not take part in
            # the match (group() returns None).
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except:
            # Any failure to find or parse the date is ignored.
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Example #53
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """Build a MetaInformation object from fields found in a file name.

    Everything after the last ``.`` is dropped from *name*, underscores are
    replaced with spaces, and the result is searched with *pat* (then with
    *fallback_pat* if *pat* did not match). Each of the named groups
    ``title``, ``author``, ``series``, ``series_index``, ``isbn``,
    ``publisher``, ``published`` and ``comments`` that the matching pattern
    defines is copied onto the result; groups the pattern does not define
    are skipped.

    :param name: File name as text or as a byte string. Byte strings are
        decoded with ``filesystem_encoding``, replacing undecodable bytes.
    :param pat: Compiled regular expression to search with. When ``None``,
        the pattern stored in ``prefs`` under ``'filename_pattern'`` is
        compiled and used.
    :param fallback_pat: Optional compiled regular expression tried when
        *pat* finds no match.
    :return: The populated ``MetaInformation``. If ``mi.is_null('title')``
        is true after extraction, the title is set to the cleaned-up name.
    """
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    # NOTE(review): rpartition() yields '' for a name containing no '.', so
    # an extension-less name is reduced to an empty string here.
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # match.group() raises IndexError for a group name the pattern does
        # not define; each field is therefore extracted independently so a
        # missing group never prevents the other fields from being read.
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        # Move the last part of the name to the front:
                        # split on the first comma if there is one,
                        # otherwise on the first run of whitespace.
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            # float(None) raises TypeError when the group exists but did not
            # take part in the match.
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # Best effort: an absent group or an unparseable date leaves
            # pubdate unset. Exception (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Example #54
0
    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Build an OEB book from an HTML file and the files it links to.

        An unpopulated book is created through the plumber's
        ``create_oebbook``, its metadata is filled in from *mi* (with
        fallbacks taken from *opts*), every non-binary file returned by
        ``get_filelist`` is added to the manifest and spine, links in the
        HTML and CSS items are rewritten through ``self.resource_adder``,
        and a table of contents is generated from document titles or
        headings.

        :param htmlpath: Path of the root HTML file; passed to
            ``get_filelist``.
        :param basedir: Base directory passed to ``get_filelist``.
        :param opts: Conversion options; ``input_encoding`` is read
            directly, ``language`` and ``authors`` are read if present.
        :param log: Logger, also called directly to emit progress messages.
        :param mi: Metadata object handed to ``meta_info_to_oeb_metadata``.
        :return: The populated OEB book (also stored as ``self.oeb``).
        """
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer, rewrite_links,
                                             urlnormalize, urldefrag,
                                             BINARY_MIME, OEB_STYLES, xpath,
                                             urlquote)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import css_parser, logging
        # Only warnings and above are emitted by the CSS parser's logger.
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log,
                             None,
                             opts,
                             self,
                             encoding=opts.input_encoding,
                             populate=False)
        self.oeb = oeb

        # --- Metadata: copy from mi, then fill in any missing essentials ---
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            # Prefer the language from the options, else fall back to
            # get_lang() with '_' replaced by '-'.
            l = canonicalize_lang(getattr(opts, 'language', None))
            if not l:
                oeb.logger.warn('Language not specified')
                l = get_lang().replace('_', '-')
            metadata.add('language', l)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        # Always add a freshly generated UUID identifier.
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # NOTE(review): this assigns the first identifier rather than
                # `ident`, the one that was found to carry an 'id' attribute
                # - confirm this is intended.
                self.oeb.uid = metadata.identifier[0]
                break

        # --- Manifest/spine: one entry per non-binary file ---
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}  # source path -> manifest href
        for f in filelist:
            path = f.path
            # The container is re-pointed at the directory of the file
            # currently being added.
            oeb.container = DirContainer(os.path.dirname(path),
                                         log,
                                         ignore_opf=True)
            bname = os.path.basename(path)
            id, href = oeb.manifest.generate(id='html',
                                             href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(id, href, 'text/html')
            # The root file's base name is URL-quoted when its path contains
            # a literal '%'.
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        # Record path -> href for the files added so far; paths for which
        # self.is_case_sensitive() is false are stored lower-cased.
        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Expose the locally imported helpers as attributes (presumably for
        # use by self.resource_adder - verify against that method).
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        # --- Rewrite links in every HTML file, relative to its directory ---
        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                # Retry the lookup with the normalized form of the href.
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # --- Rewrite URLs inside stylesheet items ---
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                # Reverse lookup of the stylesheet's source directory; dpath
                # stays None when no recorded resource has this href.
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(
                    item.data, partial(self.resource_adder, base=dpath))

        # --- Table of contents from <title> text or first heading text ---
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            # Placeholder, replaced by the first non-empty heading text found
            # among the tags below (tried in this order).
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        # Use the headings instead when the collected titles contain
        # duplicates.
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        # NOTE(review): `headers` has one entry per linear item and `titles`
        # one per linear item with a non-empty title, but zip() pairs them
        # with every spine item; entries may be attached to the wrong item
        # when non-linear or untitled items exist - verify.
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        # Leave the container pointing at the current working directory.
        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb
Example #55
0
 def authors_from_string(self, raw):
     """Parse *raw* with ``string_to_authors`` and store the result in ``self.authors``."""
     from calibre.ebooks.metadata import string_to_authors
     parsed_authors = string_to_authors(raw)
     self.authors = parsed_authors
Example #56
0
 def auto_generate(self, *args):
     """Set ``self.current_val`` to the author sort derived from the authors edit text.

     A trailing "et al." (with its leading whitespace) is removed and the
     text is stripped before it is split into authors. Extra positional
     arguments are accepted and ignored.
     """
     entered = unicode(self.authors_edit.text())
     without_et_al = re.sub(r'\s+et al\.$', '', entered)
     author_list = string_to_authors(without_et_al.strip())
     sort_value = self.db.author_sort_from_authors(author_list)
     self.current_val = sort_value
Example #57
0
 def fget(self):
     """Return the authors parsed from the widget text.

     When the stripped text is empty, ``self.get_default()`` is parsed
     instead.
     """
     text = unicode(self.text()).strip()
     return string_to_authors(text or self.get_default())
Example #58
0
 def deduce_author_sort(self, *args):
     """Fill the author-sort field from the currently selected author text.

     A trailing "et al." (with its leading whitespace) is removed before the
     text is split into authors. Extra positional arguments are accepted and
     ignored.
     """
     current = str(self.author.currentText())
     cleaned = re.sub(r'\s+et al\.$', '', current)
     sort_value = self.db.author_sort_from_authors(string_to_authors(cleaned))
     self.author_sort.setText(sort_value)
Example #59
0
def get_metadata_(src, encoding=None):
    """Extract book metadata from markup text such as an HTML document.

    Only the first 150000 characters of *src* are examined. Each field is
    looked up first in the result of ``parse_comment_tags`` and then in the
    result of ``parse_meta_tags``; values are stripped, and empty values are
    treated as missing.

    :param src: Document source, as text or as a byte string.
    :param encoding: Encoding used to decode a byte-string *src* (undecodable
        bytes are replaced). When falsy, ``xml_to_unicode`` is used to
        decode it instead.
    :return: A ``Metadata`` object. Title and authors default to the
        translated string ``'Unknown'``; other fields are set only when a
        usable value was found.
    """
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        # Comment tags take precedence over meta tags; the result is either
        # a non-empty stripped string or None.
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        # Fall back to the contents of the <title> element.
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            # Best effort: a missing or unparseable date leaves the field
            # unset. Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        # A trailing "[<number>]" on the series name carries the index.
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except (TypeError, ValueError):
                # e.g. "[1.2.3]" or "[.]" match the pattern but are not
                # valid floats.
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            try:
                series_index = float(get('series_index'))
            except (TypeError, ValueError):
                # Missing (None) or non-numeric value: keep the index unset
                # instead of storing the raw string on mi.series_index.
                series_index = None
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            # Negative ratings become 0; values above 5 are halved
            # (presumably given on a 0-10 scale - confirm) and reset to 0
            # if still above 5 afterwards.
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except Exception:
            # Best effort: an unparseable rating is ignored.
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi