def _metadata(self, baike):
    """Build a calibre ``Metadata`` record from a Baidu Baike page object.

    Args:
        baike: Page-like object. This method calls ``get_info()``,
            ``get_tags()``, ``get_image()`` and ``get_summary()`` on it and
            reads ``baike.http.url``.

    Returns:
        The populated ``Metadata`` instance. When ``self.copy_image`` is
        true and the page provides a cover URL, ``cover_data`` is set to
        ``(extension, StringIO)`` holding the downloaded image bytes.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    # The parenthesised single-argument form prints exactly what the
    # Python 2 print statement printed.
    print("\n".join("%s:\t%s" % v for v in info.items()))
    mi = Metadata(info['title'])

    # Publisher fallback chain; each later key overrides the previous value.
    # The default is a unicode literal so the unicode .replace() below never
    # has to implicitly decode a non-ASCII byte string.
    plat = u"网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)

    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[N]" marker from the end of the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    # Only download when there is actually a cover URL to fetch.
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # When the serialisation status says "完结", try to take the
    # publication date from the first date-like token in that field.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d*-\d*-\d*', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date token, or one that is not a valid Y-m-d date:
            # keep the "now" default assigned above.
            pass
    return mi
def opdsToMetadata(self, opdsBookStructure):
    """Convert one parsed OPDS feed entry into a ``Metadata`` object.

    Args:
        opdsBookStructure: Entry object exposing ``author``, ``title``,
            ``id`` and ``updated`` attributes plus a dict-style ``get()``
            for ``summary`` and ``links``.

    Returns:
        ``Metadata`` with title, authors, ``uuid``, ``timestamp``, ``tags``
        and ``links`` (download URLs, Mobipocket first) filled in.

    Raises:
        ValueError: if ``updated`` is not ``%Y-%m-%dT%H:%M:%S`` once an
            optional fractional part and a ``+00:00``/``Z`` suffix have
            been removed.
    """
    # Authors are "&"-separated. Strip each name so "A & B" yields
    # ["A", "B"] rather than ["A ", "B"].
    authors = [name.strip() for name in opdsBookStructure.author.split(u"&")]
    metadata = Metadata(opdsBookStructure.title, authors)
    metadata.uuid = opdsBookStructure.id.replace("urn:uuid:", "", 1)

    rawTimestamp = opdsBookStructure.updated
    parsableTimestamp = re.sub(r"((\.[0-9]+)?\+00:00|Z)$", "", rawTimestamp)
    metadata.timestamp = datetime.datetime.strptime(
        parsableTimestamp, "%Y-%m-%dT%H:%M:%S")

    # Tags are carried in a summary line of the form "TAGS: a, b<br />".
    # If several such lines exist, the last one wins.
    tags = []
    summary = opdsBookStructure.get(u"summary", u"")
    for summaryline in summary.splitlines():
        if summaryline.startswith(u"TAGS: "):
            tagsline = summaryline.replace(u"TAGS: ", u"")
            tagsline = tagsline.replace(u"<br />", u"")
            # Strip each tag and drop empty ones, so an empty TAGS line
            # gives [] instead of [u""].
            tags = [tag.strip() for tag in tagsline.split(u",") if tag.strip()]
    metadata.tags = tags

    bookDownloadUrls = []
    for link in opdsBookStructure.get("links", []):
        url = link.get("href", "")
        bookType = link.get("type", "")
        # Skip covers and thumbnails
        if bookType.startswith("image/"):
            continue
        if bookType == "application/x-mobipocket-ebook":
            # Mobipocket links are preferred and always put at the head
            # of the list if found
            bookDownloadUrls.insert(0, url)
        else:
            # Other formats are appended as they are found
            bookDownloadUrls.append(url)
    metadata.links = bookDownloadUrls
    return metadata
def _metadata(self, book):
    """Turn a Douban book record (dict) into a calibre ``Metadata`` object.

    Author names are cleaned with every pattern in ``REMOVES``; when no
    author remains the placeholder name is used. If ``self.copy_image`` is
    true the large cover image is downloaded into ``cover_data``.

    Args:
        book: Mapping with the keys read below (``title``, ``author``,
            ``publisher``, ``summary``, ``tags``, ``rating``, ``pubdate``,
            ``id``, ``author_intro``, ``images`` and optionally ``isbn13``
            and ``subtitle``).

    Returns:
        The populated ``Metadata`` instance.
    """
    def scrub(name):
        # Run one author name through every REMOVES pattern in order.
        for pattern in REMOVES:
            name = pattern.sub("", name)
        return name

    authors = [scrub(name) for name in (book['author'] or [])]
    if not authors:
        authors = [u'佚名']

    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    mi = Metadata(book['title'])
    mi.authors = authors
    mi.author_sort = mi.authors[0]
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)

    # All tag names are collected first, then capped at eight.
    tag_names = [tag['name'] for tag in book['tags']]
    mi.tags = tag_names[:8]

    average = float(book['rating']['average'])
    mi.rating = int(average)
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()

    # Douban-specific extras.
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)
    mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
    mi.source = u'豆瓣'

    mi.cover_url = book['images']['large']
    if self.copy_image:
        payload = urlopen(mi.cover_url).read()
        cover = StringIO(payload)
        extension = mi.cover_url.rsplit(".", 1)[-1]
        mi.cover_data = (extension, cover)

    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
def get_metadata(self, md, select):
    """Fetch a book record for *md* and convert it to a ``Metadata`` object.

    The lookup tries ``md.isbn`` first (when set) and falls back to a
    title search using ``md.title``, ``md.author_sort`` and *select*.

    Returns:
        The populated ``Metadata`` (including downloaded cover data), or
        ``None`` when neither lookup produced a book.
    """
    book = self.get_book_by_isbn(md.isbn) if md.isbn else None
    if not book:
        book = self.get_book_by_title(md.title, md.author_sort, select)
        if not book:
            return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    if mi.authors:
        mi.author_sort = mi.authors[0]
    else:
        mi.author_sort = None
    if mi.author_sort:
        # Clean the sort name with every REMOVES pattern, then write the
        # cleaned value back as the first author.
        for pattern in REMOVES:
            mi.author_sort = pattern.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort

    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)

    # All tag names are collected first, then capped at eight.
    tag_names = [tag['name'] for tag in book['tags']]
    mi.tags = tag_names[:8]

    average = float(book['rating']['average'])
    mi.rating = int(average)
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    # The cover is always downloaded; its format is the URL's last
    # dot-separated component.
    cover_url = book['images']['large']
    extension = cover_url.rsplit(".", 1)[-1]
    cover = StringIO(urlopen(cover_url).read())
    mi.cover_data = (extension, cover)
    return mi
def opdsToMetadata(self, opdsBookStructure):
    """Convert one parsed OPDS feed entry into a ``Metadata`` object.

    Args:
        opdsBookStructure: Entry object exposing ``author``, ``title``,
            ``id`` and ``updated`` attributes plus a dict-style ``get()``
            for ``summary`` and ``links``.

    Returns:
        ``Metadata`` with title, authors, ``uuid``, ``timestamp``, ``tags``
        and ``links`` (download URLs, EPUB first) filled in.

    Raises:
        ValueError: if ``updated`` is not ``%Y-%m-%dT%H:%M:%S`` once an
            optional fractional part and a ``+00:00``/``Z`` suffix have
            been removed.
    """
    # Authors are '&'-separated. Strip each name so 'A & B' yields
    # ['A', 'B'] rather than ['A ', 'B'].
    authors = [name.strip() for name in opdsBookStructure.author.split(u'&')]
    metadata = Metadata(opdsBookStructure.title, authors)
    metadata.uuid = opdsBookStructure.id.replace('urn:uuid:', '', 1)

    rawTimestamp = opdsBookStructure.updated
    # Raw string: the pattern contains backslash escapes meant for the
    # regex engine, not for the Python string parser.
    parsableTimestamp = re.sub(r'((\.[0-9]+)?\+00:00|Z)$', '', rawTimestamp)
    metadata.timestamp = datetime.datetime.strptime(parsableTimestamp, '%Y-%m-%dT%H:%M:%S')

    # Tags are carried in a summary line of the form 'TAGS: a, b<br />'.
    # If several such lines exist, the last one wins.
    tags = []
    summary = opdsBookStructure.get(u'summary', u'')
    for summaryline in summary.splitlines():
        if summaryline.startswith(u'TAGS: '):
            tagsline = summaryline.replace(u'TAGS: ', u'')
            tagsline = tagsline.replace(u'<br />', u'')
            # Strip each tag and drop empty ones, so an empty TAGS line
            # gives [] instead of [u''].
            tags = [tag.strip() for tag in tagsline.split(u',') if tag.strip()]
    metadata.tags = tags

    bookDownloadUrls = []
    for link in opdsBookStructure.get('links', []):
        url = link.get('href', '')
        bookType = link.get('type', '')
        # Skip covers and thumbnails
        if bookType.startswith('image/'):
            continue
        if bookType == 'application/epub+zip':
            # EPUB books are preferred and always put at the head of the list if found
            bookDownloadUrls.insert(0, url)
        else:
            # Formats other than EPUB (eg. AZW), are appended as they are found
            bookDownloadUrls.append(url)
    metadata.links = bookDownloadUrls
    return metadata
def get_metadata(self, md):
    """Fetch a book record for *md* and convert it to a ``Metadata`` object.

    The lookup tries ``md.isbn`` first (when set) and falls back to a
    title search on ``md.title``.

    Returns:
        The populated ``Metadata`` (including downloaded cover data), or
        ``None`` when neither lookup produced a book.
    """
    book = None
    if md.isbn:
        book = self.get_book_by_isbn(md.isbn)
    if not book:
        book = self.get_book_by_title(md.title)
    if not book:
        # Both lookups failed; without this guard the subscripting below
        # would raise on the empty result.
        return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        # Clean the sort name with every REMOVES pattern, then write the
        # cleaned value back as the first author.
        for r in REMOVES:
            mi.author_sort = r.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    # All tag names are collected first, then capped at eight.
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    # The cover is always downloaded; its format is the URL's last
    # dot-separated component.
    img_url = book['images']['large']
    img_fmt = img_url.split(".")[-1]
    img = StringIO(urlopen(img_url).read())
    mi.cover_data = (img_fmt, img)

    # Diagnostic dump only, so it is logged at DEBUG rather than ERROR.
    logging.debug("=================\ndouban metadata:\n%s", mi)
    return mi
def read_metadata(root):
    """Read metadata from *root* into a new ``Metadata`` object.

    The result starts out with 'Unknown' title and author. A field is only
    overwritten when the matching ``read_*`` helper returns a truthy
    value, or for dates a value that ``is_date_undefined`` rejects as
    undefined.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    # Every reader below takes the same three positional arguments.
    ctx = (root, prefixes, refines)

    # 'calibre' becomes the application id, 'uuid' is skipped, and every
    # other scheme keeps its first value.
    ids = {}
    for scheme, values in read_identifiers(*ctx).iteritems():
        if scheme == 'uuid':
            continue
        if scheme == 'calibre':
            ans.application_id = values[0]
        else:
            ids[scheme] = values[0]
    ans.set_identifiers(ids)

    ans.title = read_title(*ctx) or ans.title
    ans.title_sort = read_title_sort(*ctx) or ans.title_sort
    ans.languages = read_languages(*ctx) or ans.languages

    names, sort_names = [], []
    for author in read_authors(*ctx):
        names.append(author.name)
        sort_names.append(author.sort)
    ans.authors = names or ans.authors
    ans.author_sort = authors_to_string(sort_names) or ans.author_sort

    producers = read_book_producers(*ctx)
    if producers:
        ans.book_producer = producers[0]

    # Dates are copied only when they are defined.
    date_readers = (
        ('pubdate', read_pubdate),
        ('timestamp', read_timestamp),
        ('last_modified', read_last_modified),
    )
    for field, reader in date_readers:
        value = reader(*ctx)
        if not is_date_undefined(value):
            setattr(ans, field, value)
    return ans
def get_baike_metadata(self, title):
    """Look up *title* on Baidu Baike and build a ``Metadata`` record.

    Args:
        title: Title passed to ``baidubaike.Page``.

    Returns:
        The populated ``Metadata`` instance, or ``None`` when the page
        object could not be created.
    """
    from baidubaike import Page

    try:
        baike = Page(title)
    except Exception:
        # Best effort: any lookup failure means "no metadata", but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None

    info = baike.get_info()
    # The parenthesised single-argument form prints exactly what the
    # Python 2 print statement printed.
    print("\n".join("%s:\t%s" % v for v in info.items()))
    mi = Metadata(info['title'])

    # Publisher: prefer the first-release site, else derive it from the
    # first-release status. The default is a unicode literal so the
    # unicode .replace() never has to implicitly decode a non-ASCII byte
    # string.
    plat = info.get(u'首发网站', None)
    if not plat:
        plat = info.get(u'首发状态', u"网络小说平台")
        plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)

    # Fall back to a placeholder author instead of raising KeyError when
    # the page has no author entry.
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.isbn = '0000000000001'
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.comments = baike.get_summary()

    # When the serialisation status says "完结", try to take the
    # publication date from the first date-like token in that field.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d*-\d*-\d*', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date token, or one that is not a valid Y-m-d date:
            # keep the "now" default assigned above.
            pass
    return mi
def read_metadata(root, ver=None, return_extra_data=False):
    """Read metadata from *root* into a new ``Metadata`` object.

    The result starts out with 'Unknown' title and author. A field is only
    overwritten when the matching ``read_*`` helper returns a truthy
    value, or for dates a value that ``is_date_undefined`` rejects as
    undefined.

    Args:
        root: Parsed document root handed to every ``read_*`` helper.
        ver: Passed through unchanged in the extra-data tuple.
        return_extra_data: When true, return a 4-tuple instead of the
            bare metadata object.

    Returns:
        ``Metadata``, or ``(Metadata, ver, raster_cover, first_spine_item)``
        when *return_extra_data* is true.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)

    def read(reader):
        # Every reader takes the same (root, prefixes, refines) arguments.
        return reader(root, prefixes, refines)

    ids = {}
    for scheme, values in iteritems(read(read_identifiers)):
        first = values[0]
        if scheme == 'calibre':
            ans.application_id = first
        elif scheme == 'uuid':
            ans.uuid = first
        else:
            ids[scheme] = first
    ans.set_identifiers(ids)

    ans.title = read(read_title) or ans.title
    ans.title_sort = read(read_title_sort) or ans.title_sort
    ans.languages = read(read_languages) or ans.languages

    names, sort_names = [], []
    for author in read(read_authors):
        names.append(author.name)
        sort_names.append(author.sort)
    ans.authors = names or ans.authors
    ans.author_sort = authors_to_string(sort_names) or ans.author_sort

    producers = read(read_book_producers)
    if producers and producers[0]:
        ans.book_producer = producers[0]

    # Dates are copied only when they are defined.
    published = read(read_pubdate)
    if not is_date_undefined(published):
        ans.pubdate = published
    added = read(read_timestamp)
    if not is_date_undefined(added):
        ans.timestamp = added
    modified = read(read_last_modified)
    if not is_date_undefined(modified):
        ans.last_modified = modified

    ans.comments = read(read_comments) or ans.comments
    ans.publisher = read(read_publisher) or ans.publisher
    ans.tags = read(read_tags) or ans.tags
    ans.rating = read(read_rating) or ans.rating

    series, series_index = read(read_series)
    if series:
        ans.series, ans.series_index = series, series_index

    ans.author_link_map = read(read_author_link_map) or ans.author_link_map
    ans.user_categories = read(read_user_categories) or ans.user_categories
    for name, field_meta in iteritems(read(read_user_metadata) or {}):
        ans.set_user_metadata(name, field_meta)

    if not return_extra_data:
        return ans
    return ans, ver, read(read_raster_cover), read(first_spine_item)
def test_input_meta_single(self):
    """Metadata read from the 'meta_single' stream matches the expected record."""
    actual = get_metadata(self.get_stream('meta_single'))

    expected = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
    # rating and comments are not set on the expected record.
    field_values = (
        ('publisher', 'Publisher A'),
        ('languages', ['English']),
        ('pubdate', parse_date('2019-01-01')),
        ('timestamp', parse_date('2018-01-01')),
        ('series', 'Meta Series'),
        ('series_index', float(1)),
        ('tags', ['tag a', 'tag b']),
    )
    for field, value in field_values:
        setattr(expected, field, value)
    expected.set_identifiers({'isbn': '1234567890'})

    self.compare_metadata(actual, expected)
def read_metadata(root, ver=None, return_extra_data=False):
    """Read metadata from *root* into a new ``Metadata`` object.

    The result starts out with 'Unknown' title and author. A field is only
    overwritten when the matching ``read_*`` helper returns a truthy
    value, or for dates a value that ``is_date_undefined`` rejects as
    undefined.

    Args:
        root: Parsed document root handed to every ``read_*`` helper.
        ver: Passed through unchanged in the extra-data tuple.
        return_extra_data: When true, return a 4-tuple instead of the
            bare metadata object.

    Returns:
        ``Metadata``, or ``(Metadata, ver, raster_cover, first_spine_item)``
        when *return_extra_data* is true.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    # Every reader below takes the same three positional arguments.
    args = (root, prefixes, refines)

    def fill(readers):
        # Overwrite each field only when its reader yields a truthy value.
        for field, reader in readers:
            setattr(ans, field, reader(*args) or getattr(ans, field))

    ids = {}
    for scheme, values in read_identifiers(*args).iteritems():
        first = values[0]
        if scheme == 'calibre':
            ans.application_id = first
        elif scheme == 'uuid':
            ans.uuid = first
        else:
            ids[scheme] = first
    ans.set_identifiers(ids)

    fill((
        ('title', read_title),
        ('title_sort', read_title_sort),
        ('languages', read_languages),
    ))

    names, sort_names = [], []
    for author in read_authors(*args):
        names.append(author.name)
        sort_names.append(author.sort)
    ans.authors = names or ans.authors
    ans.author_sort = authors_to_string(sort_names) or ans.author_sort

    producers = read_book_producers(*args)
    if producers and producers[0]:
        ans.book_producer = producers[0]

    # Dates are copied only when they are defined.
    date_readers = (
        ('pubdate', read_pubdate),
        ('timestamp', read_timestamp),
        ('last_modified', read_last_modified),
    )
    for field, reader in date_readers:
        value = reader(*args)
        if not is_date_undefined(value):
            setattr(ans, field, value)

    fill((
        ('comments', read_comments),
        ('publisher', read_publisher),
        ('tags', read_tags),
        ('rating', read_rating),
    ))

    series, series_index = read_series(*args)
    if series:
        ans.series, ans.series_index = series, series_index

    fill((
        ('author_link_map', read_author_link_map),
        ('user_categories', read_user_categories),
    ))

    user_metadata = read_user_metadata(*args) or {}
    for name, field_meta in user_metadata.iteritems():
        ans.set_user_metadata(name, field_meta)

    if not return_extra_data:
        return ans
    return ans, ver, read_raster_cover(*args), first_spine_item(*args)
def test_input_comment_single(self):
    """Metadata read from the 'comment_single' stream matches the expected record."""
    actual = get_metadata(self.get_stream('comment_single'))

    expected_authors = ['James Madison', 'James Monroe']
    expected = Metadata('A Comment Tag & Title Ⓒ', expected_authors)
    expected.publisher = 'Publisher C'
    expected.languages = ['French']
    expected.pubdate = parse_date('2015-01-01')
    expected.timestamp = parse_date('2014-01-01')
    expected.series = 'Comment Series'
    expected.series_index = float(3)
    expected.rating = float(0)
    expected.comments = 'comment "comments" ♥ HTML -- too &amp;'
    expected.tags = ['tag d']

    identifiers = {
        'isbn': '3456789012',
        'url': 'http://google.com/search?q=calibre',
    }
    expected.set_identifiers(identifiers)

    self.compare_metadata(actual, expected)
def test_input_meta_multi(self):
    """Metadata read from the 'meta_multi' stream matches the expected record."""
    parsed = get_metadata(self.get_stream('meta_multi'))

    title = 'A Meta Tag & Title Ⓒ'
    authors = ['George Washington', 'John Adams', 'Thomas Jefferson']
    reference = Metadata(title, authors)
    reference.publisher = 'Publisher A'
    reference.languages = ['English', 'Spanish']
    reference.pubdate = parse_date('2019-01-01')
    reference.timestamp = parse_date('2018-01-01')
    reference.series = 'Meta Series'
    reference.series_index = float(1)
    reference.rating = float(8)
    reference.comments = 'meta "comments" ♥ HTML &amp;'
    reference.tags = ['tag a', 'tag b', 'tag c']
    reference.set_identifiers(dict(
        isbn='1234567890',
        url='http://google.com/search?q=calibre',
    ))

    self.compare_metadata(parsed, reference)
def _metadata(self, baike):
    """Build a calibre ``Metadata`` record from a Baidu Baike page object.

    Args:
        baike: Page-like object. This method calls ``get_info()``,
            ``get_tags()``, ``get_image()``, ``get_summary()`` and
            ``get_id()`` on it and reads ``baike.http.url``.

    Returns:
        The populated ``Metadata`` instance. When ``self.copy_image`` is
        true and the page provides a cover URL, ``cover_data`` is set to
        ``(extension, BytesIO)`` holding the downloaded image bytes.
    """
    from calibre.ebooks.metadata.book.base import Metadata

    info = baike.get_info()
    logging.debug("\n".join("%s:\t%s" % v for v in info.items()))
    mi = Metadata(info['title'])

    # Publisher fallback chain; each later key overrides the previous value.
    # The default is a unicode literal like every other string here, so
    # the unicode .replace() below never mixes byte and text strings.
    plat = u"网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)

    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[N]" marker from the end of the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    mi.provider_key = KEY
    mi.provider_value = baike.get_id()

    if self.copy_image and mi.cover_url:
        logging.debug("fetching cover: %s", mi.cover_url)
        img = io.BytesIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # When the serialisation status says "完结", try to take the
    # publication date from the first date-like token in that field.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d*-\d*-\d*', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date token, or one that is not a valid Y-m-d date:
            # keep the "now" default assigned above.
            pass
    return mi
def _metadata(self, baike):
    """Build a calibre ``Metadata`` record from a Baidu Baike page object.

    Args:
        baike: Page-like object. This method calls ``get_info()``,
            ``get_tags()``, ``get_image()`` and ``get_summary()`` on it and
            reads ``baike.http.url``.

    Returns:
        The populated ``Metadata`` instance. When ``self.copy_image`` is
        true and the page provides a cover URL, ``cover_data`` is set to
        ``(extension, StringIO)`` holding the downloaded image bytes.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    # The parenthesised single-argument form prints exactly what the
    # Python 2 print statement printed.
    print("\n".join("%s:\t%s" % v for v in info.items()))
    mi = Metadata(info['title'])

    # Publisher fallback chain; each later key overrides the previous value.
    # The default is a unicode literal so the unicode .replace() below never
    # has to implicitly decode a non-ASCII byte string.
    plat = u"網絡小說平台"
    plat = info.get(u'首發狀態', plat)
    plat = info.get(u'首發網站', plat)
    plat = plat.replace(u'首發', '')
    mi.publisher = info.get(u'連載平台', plat)

    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[N]" marker from the end of the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    # Only download when there is actually a cover URL to fetch.
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # When the serialisation status says "完結", try to take the
    # publication date from the first date-like token in that field.
    if u'完結' in info.get(u'連載狀態', ""):
        day = re.findall(r'\d*-\d*-\d*', info[u'連載狀態'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date token, or one that is not a valid Y-m-d date:
            # keep the "now" default assigned above.
            pass
    return mi
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    """Assemble a ``Metadata`` object for *book_id* from the field accessors.

    Args:
        book_id: Id of the book whose fields are read.
        get_user_categories: When true, ``user_categories`` is computed
            from ``self.backend.prefs['user_categories']``; otherwise it
            is left as an empty dict.

    Returns:
        The populated ``Metadata`` instance.
    """
    mi = Metadata(None, template_cache=self.formatter_template_cache)

    # Author names in order, plus name -> sort / name -> link lookups.
    author_ids = self._field_ids_for('authors', book_id)
    author_records = [self._author_data(i) for i in author_ids]
    author_names = [rec['name'] for rec in author_records]
    sort_by_name = {rec['name']: rec['sort'] for rec in author_records}
    link_by_name = {rec['name']: rec['link'] for rec in author_records}

    mi.title = self._field_for('title', book_id,
                               default_value=_('Unknown'))
    mi.authors = author_names
    mi.author_sort = self._field_for('author_sort', book_id,
                                     default_value=_('Unknown'))
    mi.author_sort_map = sort_by_name
    mi.author_link_map = link_by_name
    mi.comments = self._field_for('comments', book_id)
    mi.publisher = self._field_for('publisher', book_id)

    # A single "now" value is the default for every missing date field.
    now = nowf()
    mi.timestamp = self._field_for('timestamp', book_id, default_value=now)
    mi.pubdate = self._field_for('pubdate', book_id, default_value=now)
    mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
    mi.title_sort = self._field_for('sort', book_id,
                                    default_value=_('Unknown'))
    mi.book_size = self._field_for('size', book_id, default_value=0)
    mi.ondevice_col = self._field_for('ondevice', book_id,
                                      default_value='')
    mi.last_modified = self._field_for('last_modified', book_id,
                                       default_value=now)

    formats = self._field_for('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(self._field_for('languages', book_id))
    if formats:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        mi.formats = FormatsList(formats, mi.format_metadata)
    else:
        mi.formats = None

    if self._field_for('cover', book_id, default_value=False):
        mi.has_cover = _('Yes')
    else:
        mi.has_cover = ''
    mi.tags = list(self._field_for('tags', book_id, default_value=()))
    mi.series = self._field_for('series', book_id)
    if mi.series:
        mi.series_index = self._field_for('series_index', book_id,
                                          default_value=1.0)
    mi.rating = self._field_for('rating', book_id)
    mi.set_identifiers(
        self._field_for('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id

    # Custom fields: plain ones are set immediately; composite ones are
    # set in a second pass, after all plain fields.
    composite_keys = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composite_keys.append(key)
            continue
        value = self._field_for(key, book_id)
        if isinstance(value, tuple):
            value = list(value)
        extra = self._field_for(key + '_index', book_id)
        mi.set(key, val=value, extra=extra)
    for key in composite_keys:
        mi.set(key, val=self._composite_for(key, book_id, mi))

    # For each user category keep the [name, field] pairs whose name is
    # the book's value for that field, or is contained in it for lists.
    user_cat_vals = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            matches = []
            for name, cat, _ignored in user_cats[ucat]:
                value = mi.get(cat, None)
                if isinstance(value, list):
                    hit = name in value
                else:
                    hit = name == value
                if hit:
                    matches.append([name, cat])
            user_cat_vals[ucat] = matches
    mi.user_categories = user_cat_vals

    return mi
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    """Assemble a ``Metadata`` object for *book_id* from the field accessors.

    Args:
        book_id: Id of the book whose fields are read.
        get_user_categories: When true, ``user_categories`` is computed
            from ``self.backend.prefs['user_categories']``; otherwise it
            is left as an empty dict.

    Returns:
        The populated ``Metadata`` instance.
    """
    mi = Metadata(None, template_cache=self.formatter_template_cache)

    def field(name, **kwargs):
        # Shorthand for reading one field of this book.
        return self._field_for(name, book_id, **kwargs)

    # Walk the author records once, collecting the ordered names and the
    # per-name sort/link lookups.
    names, sort_map, link_map = [], {}, {}
    for author_id in self._field_ids_for('authors', book_id):
        record = self._author_data(author_id)
        names.append(record['name'])
    # _author_data is called for every id before any record is read in the
    # original; rebuild the maps from a fresh pass to keep that grouping.
    names, sort_map, link_map = [], {}, {}
    records = [self._author_data(i)
               for i in self._field_ids_for('authors', book_id)] if False else None
    return self._get_metadata_impl(mi, field, book_id, get_user_categories) \
        if False else _finish_get_metadata(self, mi, field, book_id,
                                           get_user_categories)


def _finish_get_metadata(self, mi, field, book_id, get_user_categories):
    """Fill *mi* for *book_id*; split out of ``_get_metadata`` for length."""
    author_ids = self._field_ids_for('authors', book_id)
    records = [self._author_data(i) for i in author_ids]
    names, sort_map, link_map = [], {}, {}
    for record in records:
        author = record['name']
        names.append(author)
        sort_map[author] = record['sort']
        link_map[author] = record['link']

    unknown_title = field('title', default_value=_('Unknown'))
    mi.title = unknown_title
    mi.authors = names
    mi.author_sort = field('author_sort', default_value=_('Unknown'))
    mi.author_sort_map = sort_map
    mi.author_link_map = link_map
    mi.comments = field('comments')
    mi.publisher = field('publisher')

    # A single "now" value is the default for every missing date field.
    fallback_date = nowf()
    mi.timestamp = field('timestamp', default_value=fallback_date)
    mi.pubdate = field('pubdate', default_value=fallback_date)
    mi.uuid = field('uuid', default_value='dummy')
    mi.title_sort = field('sort', default_value=_('Unknown'))
    mi.book_size = field('size', default_value=0)
    mi.ondevice_col = field('ondevice', default_value='')
    mi.last_modified = field('last_modified', default_value=fallback_date)

    formats = field('formats')
    mi.format_metadata = {}
    mi.languages = list(field('languages'))
    good_formats = None
    if formats:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        good_formats = FormatsList(formats, mi.format_metadata)
    mi.formats = good_formats

    cover_present = field('cover', default_value=False)
    mi.has_cover = _('Yes') if cover_present else ''
    mi.tags = list(field('tags', default_value=()))
    mi.series = field('series')
    if mi.series:
        mi.series_index = field('series_index', default_value=1.0)
    mi.rating = field('rating')
    mi.set_identifiers(field('identifiers', default_value={}))
    mi.application_id = book_id
    mi.id = book_id

    # Custom fields: plain ones are set immediately; composite ones are
    # set in a second pass, after all plain fields.
    deferred = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            deferred.append(key)
        else:
            val = field(key)
            if isinstance(val, tuple):
                val = list(val)
            extra = field(key+'_index')
            mi.set(key, val=val, extra=extra)
    for key in deferred:
        mi.set(key, val=self._composite_for(key, book_id, mi))

    def belongs(name, cat):
        # True when *name* is the book's value for field *cat*, or is
        # contained in it when that value is a list.
        current = mi.get(cat, None)
        if isinstance(current, list):
            return name in current
        return name == current

    user_cat_vals = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            user_cat_vals[ucat] = [
                [name, cat]
                for name, cat, ign in user_cats[ucat]
                if belongs(name, cat)
            ]
    mi.user_categories = user_cat_vals

    return mi