Beispiel #1
0
    def get_metadata(self, md, select):
        """Fetch the Douban record matching ``md`` and convert it to ``Metadata``.

        An ISBN lookup is attempted first when ``md.isbn`` is set; when that
        is skipped or finds nothing, a title search is made with
        ``md.title``, ``md.author_sort`` and ``select``.  Returns ``None``
        when no record is found, otherwise the populated ``Metadata`` whose
        ``cover_data`` is ``(extension, StringIO(image bytes))``.
        """
        record = self.get_book_by_isbn(md.isbn) if md.isbn else None
        if not record:
            record = self.get_book_by_title(md.title, md.author_sort, select)
        if not record:
            return None

        mi = Metadata(record['title'])
        mi.authors = record['author']
        mi.author_sort = mi.authors[0] if mi.authors else None
        if mi.author_sort:
            # Strip every REMOVES pattern from the leading author and store
            # the cleaned name back as the first author.
            for pattern in REMOVES:
                mi.author_sort = pattern.sub("", mi.author_sort)
            mi.authors[0] = mi.author_sort

        mi.publisher = record['publisher']
        mi.comments = record['summary']
        mi.isbn = record.get('isbn13', None)
        mi.tags = [tag['name'] for tag in record['tags']][:8]
        mi.rating = int(float(record['rating']['average']))
        mi.pubdate = self.str2date(record['pubdate'])
        mi.timestamp = datetime.datetime.now()
        mi.douban_id = record['id']
        mi.douban_author_intro = record['author_intro']
        mi.douban_subtitle = record.get('subtitle', None)

        # The text after the last dot of the URL is used as the image format.
        cover_url = record['images']['large']
        cover_ext = cover_url.rsplit(".", 1)[-1]
        cover = StringIO(urlopen(cover_url).read())
        mi.cover_data = (cover_ext, cover)
        return mi
Beispiel #2
0
    def _metadata(self, baike):
        """Build a ``Metadata`` object from a Baidu Baike entry.

        The entry is read through ``baike.get_info()``, ``get_tags()``,
        ``get_image()``, ``get_summary()`` and ``baike.http.url``.  The cover
        image is downloaded only when ``self.copy_image`` is set and the
        entry actually has a cover URL.  The publication date defaults to
        "now" and is replaced by the first date found in the serialisation
        status when that status marks the work as finished.
        """
        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        info = baike.get_info()
        # Single-argument parenthesised form: same output under the Python 2
        # print statement and the Python 3 print function.
        print("\n".join("%s:\t%s" % v for v in info.items()))

        mi = Metadata(info['title'])
        # Later keys win: the default platform name is overridden by the
        # first-release fields, and the serialisation platform beats both.
        plat = "网络小说平台"
        plat = info.get(u'首发状态', plat)
        plat = info.get(u'首发网站', plat)
        plat = plat.replace(u'首发', '')
        mi.publisher = info.get(u'连载平台', plat)
        mi.authors = [info.get(u'作者', u'佚名')]
        mi.author_sort = mi.authors[0]
        mi.isbn = BAIKE_ISBN
        mi.tags = baike.get_tags()
        mi.pubdate = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        # Drop a trailing "[12]"-style citation marker from the summary.
        mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
        mi.website = baike.http.url
        mi.source = u'百度百科'

        # Entries without an image must not trigger a download attempt.
        if self.copy_image and mi.cover_url:
            response = urlopen(mi.cover_url)
            try:
                img = StringIO(response.read())
            finally:
                response.close()
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        status = info.get(u'连载状态', "")
        if u'完结' in status:
            day = re.findall(r'\d*-\d*-\d*', status)
            try:
                mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except (IndexError, ValueError):
                # No date in the status text, or not a valid YYYY-MM-DD date:
                # keep the default publication date.
                pass
        return mi
Beispiel #3
0
    def _metadata(self, book):
        """Convert a Douban book record (a dict) into a ``Metadata`` object.

        Every author name has the ``REMOVES`` patterns stripped; when the
        record lists no authors a single placeholder author is used.  The
        cover image is downloaded only when ``self.copy_image`` is set.
        """
        def scrub(name):
            # Apply each REMOVES pattern in turn to one author name.
            for pattern in REMOVES:
                name = pattern.sub("", name)
            return name

        listed = book['author']
        authors = [scrub(name) for name in listed] if listed else []
        if not authors:
            authors = [u'佚名']

        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        rating = book['rating']['average']
        tag_names = [tag['name'] for tag in book['tags']]

        mi = Metadata(book['title'])
        mi.authors = authors
        mi.author_sort = mi.authors[0]
        mi.publisher = book['publisher']
        mi.comments = book['summary']
        mi.isbn = book.get('isbn13', None)
        mi.tags = tag_names[:8]
        mi.rating = int(float(rating))
        mi.pubdate = self.str2date(book['pubdate'])
        mi.timestamp = datetime.datetime.now()
        mi.douban_id = book['id']
        mi.douban_author_intro = book['author_intro']
        mi.douban_subtitle = book.get('subtitle', None)
        mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
        mi.source = u'豆瓣'

        mi.cover_url = book['images']['large']
        if self.copy_image:
            payload = urlopen(mi.cover_url).read()
            # The text after the last dot of the URL is used as the format.
            extension = mi.cover_url.rsplit(".", 1)[-1]
            mi.cover_data = (extension, StringIO(payload))

        logging.debug("=================\ndouban metadata:\n%s" % mi)
        return mi
Beispiel #4
0
    def get_metadata(self, md):
        """Fetch the Douban record matching ``md`` and convert it to ``Metadata``.

        An ISBN lookup is attempted first when ``md.isbn`` is set; when that
        is skipped or finds nothing, a title search on ``md.title`` is made.

        Returns ``None`` when neither lookup finds a book (previously this
        case crashed with a ``TypeError`` on ``book['title']``); otherwise
        the populated ``Metadata`` whose ``cover_data`` is
        ``(extension, StringIO(image bytes))``.
        """
        book = None
        if md.isbn:
            book = self.get_book_by_isbn(md.isbn)
        if not book:
            book = self.get_book_by_title(md.title)
        if not book:
            # Nothing matched: there is no record to build metadata from.
            return None

        mi = Metadata(book['title'])
        mi.authors = book['author']
        mi.author_sort = mi.authors[0] if mi.authors else None
        if mi.author_sort:
            # Strip every REMOVES pattern from the leading author and store
            # the cleaned name back as the first author.
            for r in REMOVES:
                mi.author_sort = r.sub("", mi.author_sort)
            mi.authors[0] = mi.author_sort
        mi.publisher = book['publisher']
        mi.comments = book['summary']
        mi.isbn = book.get('isbn13', None)
        mi.tags = [t['name'] for t in book['tags']][:8]
        mi.rating = int(float(book['rating']['average']))
        mi.pubdate = self.str2date(book['pubdate'])
        mi.timestamp = datetime.datetime.now()
        mi.douban_id = book['id']
        mi.douban_author_intro = book['author_intro']
        mi.douban_subtitle = book.get('subtitle', None)

        img_url = book['images']['large']
        img_fmt = img_url.split(".")[-1]
        # Close the HTTP response explicitly instead of leaving it to the
        # garbage collector.
        response = urlopen(img_url)
        try:
            img = StringIO(response.read())
        finally:
            response.close()
        mi.cover_data = (img_fmt, img)
        logging.error("=================\ndouban metadata:\n%s" % mi)
        return mi
Beispiel #5
0
def read_metadata_kfx(stream, read_cover=True):
    """Read the metadata.kfx file found in the sdr book folder of a KFX book.

    The stream is read in full, decoded via ``Container`` and handed to
    ``extract_metadata``; title, authors, author sort, ASIN, languages,
    publication date and publisher are copied from the result into a new
    ``Metadata``.  With ``read_cover`` true, the base64 cover entry is
    attached when ``identify`` recognises the decoded image.
    """
    fields = extract_metadata(Container(stream.read()).decode())

    def has(key):
        # Truthy only when the field has values and the first one is non-empty.
        return fields[key] and fields[key][0]

    def get_one(key):
        values = fields[key]
        return clean_xml_chars(values[0]) if values else ''

    def get_all(key):
        return [clean_xml_chars(value) for value in fields[key]]

    title = get_one('title') or _('Unknown')
    authors = get_all('authors') or [_('Unknown')]
    name_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(name):
        # "Last, First" becomes "First Last" unless the tweak asks for a copy.
        if tweaks['author_sort_copy_method'] == 'copy':
            return name
        hit = name_pat.match(name.strip())
        if hit is None:
            return name
        return hit.group(2) + ' ' + hit.group(1)

    mi = Metadata(title, [fix_author(name) for name in authors])
    if has('author'):
        mi.author_sort = get_one('author')
    for key in ('ASIN', 'content_id'):
        # The first of the two fields that is present supplies the identifier.
        if has(key):
            mi.set_identifier('mobi-asin', get_one(key))
            break
    if has('languages'):
        langs = [code for code in map(canonicalize_lang, get_all('languages')) if code]
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get_one('issue_date'))
        except Exception:
            pass
    if has('publisher'):
        publisher = get_one('publisher')
        if publisher != 'Unknown':
            mi.publisher = publisher
    if read_cover and fields[COVER_KEY]:
        try:
            cover = base64.standard_b64decode(fields[COVER_KEY])
            fmt, width, height = identify(bytes(cover))
        except Exception:
            # Undecodable or unrecognised cover data is skipped.
            pass
        else:
            if fmt and width > -1 and height > -1:
                mi.cover_data = (fmt, cover)

    return mi
Beispiel #6
0
 def get_metadata(stream: IO, f_type: str) -> Metadata:
     """Build ``Metadata`` from a UMD file read from ``stream``.

     ``f_type`` must be ``"umd"``; any other value trips the assertion.
     Title, author, publisher and publication date are copied from the
     parsed file, and its cover (when present) is attached as JPEG data.
     """
     assert f_type == "umd"
     umd = UMDFile.from_stream(stream)
     mi = Metadata(title=umd.title, authors=[umd.author])
     mi.publisher = umd.publisher
     mi.pubdate = umd.publish_date
     if not umd.cover:
         return mi
     mi.cover_data = ('jpeg', umd.cover)
     return mi
Beispiel #7
0
def read_metadata_kfx(stream, read_cover=True):
    """Read the metadata.kfx file found in the sdr book folder of a KFX book.

    The stream is read in full, decoded via ``Container`` and handed to
    ``extract_metadata``; title, authors, author sort, ASIN, languages,
    publication date and publisher are copied from the result into a new
    ``Metadata``.  With ``read_cover`` true, the base64 cover entry is
    attached when ``identify_data`` reports a format and non-zero
    dimensions for the decoded image.
    """
    container = Container(stream.read())
    fields = extract_metadata(container.decode())

    def has(key):
        # Truthy only when the field has values and the first one is non-empty.
        return fields[key] and fields[key][0]

    def first(key):
        entries = fields[key]
        if not entries:
            return ''
        return clean_xml_chars(entries[0])

    def every(key):
        return [clean_xml_chars(entry) for entry in fields[key]]

    title = first('title') or _('Unknown')
    authors = every('authors') or [_('Unknown')]
    comma_form = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(name):
        # "Last, First" becomes "First Last" unless the tweak asks for a copy.
        if tweaks['author_sort_copy_method'] != 'copy':
            found = comma_form.match(name.strip())
            if found is not None:
                return found.group(2) + ' ' + found.group(1)
        return name

    mi = Metadata(title, [fix_author(name) for name in authors])
    if has('author'):
        mi.author_sort = first('author')
    for key in ('ASIN', 'content_id'):
        # The first of the two fields that is present supplies the identifier.
        if has(key):
            mi.set_identifier('mobi-asin', first(key))
            break
    if has('languages'):
        langs = [code for code in map(canonicalize_lang, every('languages')) if code]
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(first('issue_date'))
        except Exception:
            pass
    if has('publisher'):
        publisher = first('publisher')
        if publisher != 'Unknown':
            mi.publisher = publisher
    if read_cover and fields[COVER_KEY]:
        try:
            cover = base64.standard_b64decode(fields[COVER_KEY])
            width, height, fmt = identify_data(cover)
        except Exception:
            # Undecodable or unrecognised cover data is skipped.
            pass
        else:
            if fmt and width and height:
                mi.cover_data = (fmt, cover)

    return mi
Beispiel #8
0
def read_metadata_kfx(stream, read_cover=True):
    """Read the metadata.kfx file found in the sdr book folder of a KFX book.

    The stream is read in full, decoded via ``Container`` and handed to
    ``extract_metadata``; title, authors, author sort, ASIN, languages,
    publication date and publisher are copied from the result into a new
    ``Metadata``.  With ``read_cover`` true, the base64 cover entry is
    attached when ``identify`` recognises the decoded image.
    """
    raw_fields = extract_metadata(Container(stream.read()).decode())

    def has(name):
        # Truthy only when the field has values and the first one is non-empty.
        return raw_fields[name] and raw_fields[name][0]

    def scalar(name):
        found = raw_fields[name]
        return clean_xml_chars(found[0]) if found else ""

    def sequence(name):
        return [clean_xml_chars(item) for item in raw_fields[name]]

    title = scalar("title") or _("Unknown")
    authors = sequence("authors") or [_("Unknown")]
    last_first = re.compile(r"([^,]+?)\s*,\s+([^,]+)$")

    def fix_author(author):
        # "Last, First" becomes "First Last" unless the tweak asks for a copy.
        if tweaks["author_sort_copy_method"] == "copy":
            return author
        parts = last_first.match(author.strip())
        if parts is None:
            return author
        return parts.group(2) + " " + parts.group(1)

    mi = Metadata(title, [fix_author(author) for author in authors])
    if has("author"):
        mi.author_sort = scalar("author")
    for name in ("ASIN", "content_id"):
        # The first of the two fields that is present supplies the identifier.
        if has(name):
            mi.set_identifier("mobi-asin", scalar(name))
            break
    if has("languages"):
        langs = [lang for lang in map(canonicalize_lang, sequence("languages")) if lang]
        if langs:
            mi.languages = langs
    if has("issue_date"):
        try:
            mi.pubdate = parse_only_date(scalar("issue_date"))
        except Exception:
            pass
    if has("publisher"):
        publisher = scalar("publisher")
        if publisher != "Unknown":
            mi.publisher = publisher
    if read_cover and raw_fields[COVER_KEY]:
        try:
            image = base64.standard_b64decode(raw_fields[COVER_KEY])
            fmt, width, height = identify(bytes(image))
        except Exception:
            # Undecodable or unrecognised cover data is skipped.
            pass
        else:
            if fmt and width > -1 and height > -1:
                mi.cover_data = (fmt, image)

    return mi
Beispiel #9
0
 def get_metadata(book_id):
     """Return ``(oldmi, newmi)`` for ``book_id``.

     ``oldmi`` is the metadata currently stored in ``db``.  ``newmi`` is
     read from the OPF file recorded in ``id_map`` for this book, or, when
     no OPF is recorded, built from the old title and authors.  A cover
     file recorded in ``id_map`` is attached to ``newmi`` as JPEG data.
     """
     oldmi = db.get_metadata(book_id, index_is_id=True, get_cover=True, cover_as_data=True)
     opf_path, cover_path = id_map[book_id]
     if opf_path is not None:
         with open(opf_path, 'rb') as opf_file:
             parsed = OPF(opf_file, basedir=os.path.dirname(opf_path), populate_spine=False)
             newmi = parsed.to_book_metadata()
             newmi.cover, newmi.cover_data = None, (None, None)
             for field in ('title', 'authors'):
                 if not newmi.is_null(field):
                     continue
                 # Title and author are set to null if they are
                 # the same as the originals as an optimization,
                 # we undo that, as it is confusing.
                 newmi.set(field, copy.copy(oldmi.get(field)))
     else:
         newmi = Metadata(oldmi.title, authors=tuple(oldmi.authors))
     if cover_path:
         with open(cover_path, 'rb') as cover_file:
             newmi.cover_data = ('jpg', cover_file.read())
     return oldmi, newmi
Beispiel #10
0
    def _metadata(self, baike):
        """Build a ``Metadata`` object from a Baidu Baike entry.

        The entry is read through ``baike.get_info()``, ``get_tags()``,
        ``get_image()``, ``get_summary()``, ``get_id()`` and
        ``baike.http.url``.  The cover image is downloaded only when
        ``self.copy_image`` is set and the entry has a cover URL.  The
        publication date defaults to "now" and is replaced by the first date
        found in the serialisation status when that status marks the work as
        finished.
        """
        from calibre.ebooks.metadata.book.base import Metadata

        info = baike.get_info()
        logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

        mi = Metadata(info['title'])
        # Later keys win: the default platform name is overridden by the
        # first-release fields, and the serialisation platform beats both.
        plat = "网络小说平台"
        plat = info.get(u'首发状态', plat)
        plat = info.get(u'首发网站', plat)
        plat = plat.replace(u'首发', '')
        mi.publisher = info.get(u'连载平台', plat)
        mi.authors = [info.get(u'作者', u'佚名')]
        mi.author_sort = mi.authors[0]
        mi.isbn = BAIKE_ISBN
        mi.tags = baike.get_tags()
        mi.pubdate = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        # Drop a trailing "[12]"-style citation marker from the summary.
        mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
        mi.website = baike.http.url
        mi.source = u'百度百科'
        mi.provider_key = KEY
        mi.provider_value = baike.get_id()

        if self.copy_image and mi.cover_url:
            logging.debug("fetching cover: %s", mi.cover_url)
            # Close the HTTP response explicitly instead of leaving it to the
            # garbage collector.
            response = urlopen(mi.cover_url)
            try:
                img = io.BytesIO(response.read())
            finally:
                response.close()
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        status = info.get(u'连载状态', "")
        if u'完结' in status:
            day = re.findall(r'\d*-\d*-\d*', status)
            try:
                mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except (IndexError, ValueError):
                # No date in the status text, or not a valid YYYY-MM-DD date:
                # keep the default publication date.
                pass
        return mi
Beispiel #11
0
def get_metadata(stream):
    """Read metadata and a cover candidate from a zip-based document.

    ``docProps/core.xml`` and ``docProps/app.xml`` (matched
    case-insensitively) are passed to ``_read_doc_props`` and
    ``_read_app_props``.  The first jpeg/jpg/png/gif member whose shape
    looks like a cover (height between 0.8 and 1.8 times the width, and at
    least 12000 pixels in area) becomes ``cover_data``.
    """
    with ZipFile(stream, 'r') as zf:

        mi = Metadata(_('Unknown'))
        cdata = None

        for zi in zf.infolist():
            name = zi.filename.lower()
            ext = zi.filename.rpartition('.')[-1].lower()
            if name == 'docprops/core.xml':
                _read_doc_props(zf.read(zi), mi)
            elif name == 'docprops/app.xml':
                _read_app_props(zf.read(zi), mi)
            elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
                raw = zf.read(zi)
                try:
                    width, height, fmt = identify_data(raw)
                except Exception:
                    # Unreadable image: try the next archive member.
                    continue
                # The aspect-ratio test is written as a multiplication so a
                # zero width cannot raise ZeroDivisionError and integer
                # division cannot truncate the ratio.
                if (width > 0 and height > 0
                        and 0.8 * width <= height <= 1.8 * width
                        and height * width >= 12000):
                    cdata = (fmt, raw)
        if cdata is not None:
            mi.cover_data = cdata

    return mi
Beispiel #12
0
    def _metadata(self, baike):
        """Build a ``Metadata`` object from a Baidu Baike entry.

        The entry is read through ``baike.get_info()``, ``get_tags()``,
        ``get_image()``, ``get_summary()`` and ``baike.http.url``.  The cover
        image is downloaded only when ``self.copy_image`` is set and the
        entry actually has a cover URL.  The publication date defaults to
        "now" and is replaced by the first date found in the serialisation
        status when that status marks the work as finished.
        """
        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        info = baike.get_info()
        # Single-argument parenthesised form: same output under the Python 2
        # print statement and the Python 3 print function.
        print("\n".join("%s:\t%s" % v for v in info.items()))

        mi = Metadata(info['title'])
        # Later keys win: the default platform name is overridden by the
        # first-release fields, and the serialisation platform beats both.
        plat = "網絡小說平台"
        plat = info.get(u'首發狀態', plat)
        plat = info.get(u'首發網站', plat)
        plat = plat.replace(u'首發', '')
        mi.publisher = info.get(u'連載平台', plat)
        mi.authors = [info.get(u'作者', u'佚名')]
        mi.author_sort = mi.authors[0]
        mi.isbn = BAIKE_ISBN
        mi.tags = baike.get_tags()
        mi.pubdate = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        # Drop a trailing "[12]"-style citation marker from the summary.
        mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
        mi.website = baike.http.url
        mi.source = u'百度百科'

        # Entries without an image must not trigger a download attempt.
        if self.copy_image and mi.cover_url:
            response = urlopen(mi.cover_url)
            try:
                img = StringIO(response.read())
            finally:
                response.close()
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        status = info.get(u'連載狀態', "")
        if u'完結' in status:
            day = re.findall(r'\d*-\d*-\d*', status)
            try:
                mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except (IndexError, ValueError):
                # No date in the status text, or not a valid YYYY-MM-DD date:
                # keep the default publication date.
                pass
        return mi
Beispiel #13
0
def get_metadata(stream):
    """Read metadata and a cover candidate from a zip-based document.

    ``docProps/core.xml`` and ``docProps/app.xml`` (matched
    case-insensitively) are passed to ``_read_doc_props`` and
    ``_read_app_props``.  The first jpeg/jpg/png/gif member whose shape
    looks like a cover (height between 0.8 and 1.8 times the width, and at
    least 12000 pixels in area) becomes ``cover_data``.
    """
    with ZipFile(stream, 'r') as zf:

        mi = Metadata(_('Unknown'))
        cdata = None

        for zi in zf.infolist():
            name = zi.filename.lower()
            ext = zi.filename.rpartition('.')[-1].lower()
            if name == 'docprops/core.xml':
                _read_doc_props(zf.read(zi), mi)
            elif name == 'docprops/app.xml':
                _read_app_props(zf.read(zi), mi)
            elif cdata is None and ext in {'jpeg', 'jpg', 'png', 'gif'}:
                raw = zf.read(zi)
                try:
                    width, height, fmt = identify_data(raw)
                except Exception:
                    # Unreadable image: try the next archive member.
                    continue
                # The aspect-ratio test is written as a multiplication so a
                # zero width cannot raise ZeroDivisionError and integer
                # division cannot truncate the ratio.
                if (width > 0 and height > 0
                        and 0.8 * width <= height <= 1.8 * width
                        and height * width >= 12000):
                    cdata = (fmt, raw)
        if cdata is not None:
            mi.cover_data = cdata

    return mi