def get_metadata(self, md):
    """Look up a book record for ``md`` and convert it to a Metadata object.

    The record is fetched with ``get_book_by_isbn`` when ``md.isbn`` is set,
    falling back to ``get_book_by_title(md.title)`` when that yields nothing.

    :param md: object exposing ``isbn`` and ``title`` attributes.
    :return: the populated Metadata object (cover image included), or
        ``None`` when neither lookup returns a record.
    """
    book = None
    if md.isbn:
        book = self.get_book_by_isbn(md.isbn)
    if not book:
        book = self.get_book_by_title(md.title)
    if not book:
        # Nothing found: bail out instead of failing on book['title'] below.
        return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        # Strip every REMOVES pattern from the first author's name and use
        # the cleaned name both as sort key and as the first author.
        for r in REMOVES:
            mi.author_sort = r.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.tags = [t['name'] for t in book['tags']][:8]  # keep at most 8 tags
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    # Download the large cover; the image format is whatever follows the
    # last "." in the URL.
    img_url = book['images']['large']
    img_fmt = img_url.split(".")[-1]
    img = StringIO(urlopen(img_url).read())
    mi.cover_data = (img_fmt, img)

    # Routine dump of the result: debug level, formatted lazily by logging.
    logging.debug("=================\ndouban metadata:\n%s", mi)
    return mi
def _metadata(self, baike):
    """Convert a baike page object into a Metadata object.

    :param baike: page object providing ``get_info()``, ``get_tags()``,
        ``get_image()``, ``get_summary()`` and ``http.url``.
    :return: the populated Metadata object. The cover image is downloaded
        only when ``self.copy_image`` is set and a cover URL is available.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])

    # Publisher fallback chain; later keys take precedence over earlier ones.
    # The default must be unicode: it is passed through .replace() with
    # unicode arguments, which would force an ASCII decode of a byte string.
    plat = u"网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', u'')
    mi.publisher = info.get(u'连载平台', plat)

    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[n]" reference marker from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    # Only fetch the cover when there is actually a URL to fetch.
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # For a finished serial, use the first date found in the status text as
    # the publication date; otherwise keep "now".
    status = info.get(u'连载状态', "")
    if u'完结' in status:
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date-like text, or text that is not a valid Y-m-d date.
            pass
    return mi
def get_baike_metadata(self, title):
    """Fetch the baike page for ``title`` and convert it to Metadata.

    :param title: title passed to ``baidubaike.Page``.
    :return: the populated Metadata object, or ``None`` when constructing
        the page fails.
    """
    from baidubaike import Page

    try:
        baike = Page(title)
    except Exception:
        # Best effort: any lookup failure means "no metadata".
        return None

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])

    # Publisher fallback chain. The default must be unicode because it is
    # passed through .replace() with unicode arguments.
    plat = info.get(u'首发网站', None)
    if not plat:
        plat = info.get(u'首发状态', u"网络小说平台")
    plat = plat.replace(u'首发', u'')
    mi.publisher = info.get(u'连载平台', plat)

    # Fall back to a placeholder author instead of failing on a missing key.
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.isbn = '0000000000001'
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.comments = baike.get_summary()

    # For a finished serial, use the first date found in the status text as
    # the publication date; otherwise keep "now".
    status = info.get(u'连载状态', "")
    if u'完结' in status:
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date-like text, or text that is not a valid Y-m-d date.
            pass
    return mi
def read_metadata(root):
    """Read the metadata stored under ``root`` into a Metadata object.

    Every field starts from the defaults of
    ``Metadata(_('Unknown'), [_('Unknown')])`` and is replaced only when the
    matching reader returns a usable value.

    :param root: parsed tree handed to each ``read_*`` helper.
    :return: the populated Metadata object.
    """
    mi = Metadata(_('Unknown'), [_('Unknown')])
    prefixes = read_prefixes(root)
    refines = read_refines(root)

    # The 'calibre' identifier becomes the application id, 'uuid' is
    # skipped, and every other scheme keeps its first value.
    plain_ids = {}
    for scheme, values in read_identifiers(root, prefixes, refines).iteritems():
        if scheme == 'calibre':
            mi.application_id = values[0]
        elif scheme != 'uuid':
            plain_ids[scheme] = values[0]
    mi.set_identifiers(plain_ids)

    mi.title = read_title(root, prefixes, refines) or mi.title
    mi.title_sort = read_title_sort(root, prefixes, refines) or mi.title_sort
    mi.languages = read_languages(root, prefixes, refines) or mi.languages

    names, sort_names = [], []
    for author in read_authors(root, prefixes, refines):
        names.append(author.name)
        sort_names.append(author.sort)
    mi.authors = names or mi.authors
    mi.author_sort = authors_to_string(sort_names) or mi.author_sort

    producers = read_book_producers(root, prefixes, refines)
    if producers:
        mi.book_producer = producers[0]

    # Dates are copied only when the reader returns a defined date.
    date_readers = (
        ('pubdate', read_pubdate),
        ('timestamp', read_timestamp),
        ('last_modified', read_last_modified),
    )
    for field, reader in date_readers:
        value = reader(root, prefixes, refines)
        if not is_date_undefined(value):
            setattr(mi, field, value)
    return mi
def get_metadata(self, md, select):
    """Look up a book record for ``md`` and convert it to a Metadata object.

    The record is fetched by ISBN when ``md.isbn`` is set, otherwise (or
    when that finds nothing) by title, author sort and ``select``.

    :param md: object exposing ``isbn``, ``title`` and ``author_sort``.
    :param select: forwarded unchanged to ``get_book_by_title``.
    :return: the populated Metadata object (cover image included), or
        ``None`` when no record is found.
    """
    record = self.get_book_by_isbn(md.isbn) if md.isbn else None
    if not record:
        record = self.get_book_by_title(md.title, md.author_sort, select)
    if not record:
        return None

    mi = Metadata(record['title'])
    mi.authors = record['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        # Strip every REMOVES pattern from the first author's name, then
        # write the cleaned name back as the first author.
        for pattern in REMOVES:
            mi.author_sort = pattern.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort

    mi.publisher = record['publisher']
    mi.comments = record['summary']
    mi.isbn = record.get('isbn13', None)
    tag_names = [tag['name'] for tag in record['tags']]
    mi.tags = tag_names[:8]
    average = record['rating']['average']
    mi.rating = int(float(average))
    mi.pubdate = self.str2date(record['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = record['id']
    mi.douban_author_intro = record['author_intro']
    mi.douban_subtitle = record.get('subtitle', None)

    # Cover: the format is whatever follows the last "." in the URL.
    cover_url = record['images']['large']
    cover_fmt = cover_url.rsplit(".", 1)[-1]
    cover_stream = StringIO(urlopen(cover_url).read())
    mi.cover_data = (cover_fmt, cover_stream)
    return mi
def _metadata(self, book):
    """Convert a book record (dict) into a Metadata object.

    :param book: record with the keys read below ('author', 'title',
        'publisher', 'summary', 'tags', 'rating', 'pubdate', 'id',
        'author_intro', 'images', and optionally 'isbn13' / 'subtitle').
    :return: the populated Metadata object; the cover image is downloaded
        only when ``self.copy_image`` is set.
    """
    # Clean every author name with the REMOVES patterns; fall back to a
    # placeholder author when the record lists none.
    cleaned = []
    for name in book['author'] or ():
        for pattern in REMOVES:
            name = pattern.sub("", name)
        cleaned.append(name)
    authors = cleaned or [u'佚名']

    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    mi = Metadata(book['title'])
    mi.authors = authors
    mi.author_sort = mi.authors[0]
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    tag_names = [tag['name'] for tag in book['tags']]
    mi.tags = tag_names[:8]
    average = book['rating']['average']
    mi.rating = int(float(average))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)
    mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
    mi.source = u'豆瓣'
    mi.cover_url = book['images']['large']

    if self.copy_image:
        # The image format is whatever follows the last "." in the URL.
        cover_stream = StringIO(urlopen(mi.cover_url).read())
        cover_fmt = mi.cover_url.rsplit(".", 1)[-1]
        mi.cover_data = (cover_fmt, cover_stream)

    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
def read_metadata(root, ver=None, return_extra_data=False):
    """Read the metadata stored under ``root`` into a Metadata object.

    Every field starts from the defaults of
    ``Metadata(_('Unknown'), [_('Unknown')])`` and is replaced only when the
    matching reader returns a usable value.

    :param root: parsed tree handed to each ``read_*`` helper.
    :param ver: value passed through untouched into the extra-data tuple.
    :param return_extra_data: when true, return the tuple
        ``(metadata, ver, raster_cover, first_spine_item)`` instead of the
        Metadata object alone.
    :return: the Metadata object, or the 4-tuple described above.
    """
    mi = Metadata(_('Unknown'), [_('Unknown')])
    prefixes = read_prefixes(root)
    refines = read_refines(root)

    def read(reader):
        # Every reader takes the same three arguments.
        return reader(root, prefixes, refines)

    def fill(*fields):
        # Replace each field with what its reader returns, keeping the
        # current value when the reader returns nothing usable.
        for field, reader in fields:
            setattr(mi, field, read(reader) or getattr(mi, field))

    # 'calibre' and 'uuid' identifiers map to dedicated attributes; every
    # other scheme keeps its first value.
    plain_ids = {}
    for scheme, values in iteritems(read(read_identifiers)):
        first = values[0]
        if scheme == 'calibre':
            mi.application_id = first
        elif scheme == 'uuid':
            mi.uuid = first
        else:
            plain_ids[scheme] = first
    mi.set_identifiers(plain_ids)

    fill(('title', read_title),
         ('title_sort', read_title_sort),
         ('languages', read_languages))

    names, sort_names = [], []
    for author in read(read_authors):
        names.append(author.name)
        sort_names.append(author.sort)
    mi.authors = names or mi.authors
    mi.author_sort = authors_to_string(sort_names) or mi.author_sort

    producers = read(read_book_producers)
    if producers and producers[0]:
        mi.book_producer = producers[0]

    # Dates are copied only when the reader returns a defined date.
    for field, reader in (('pubdate', read_pubdate),
                          ('timestamp', read_timestamp),
                          ('last_modified', read_last_modified)):
        value = read(reader)
        if not is_date_undefined(value):
            setattr(mi, field, value)

    fill(('comments', read_comments),
         ('publisher', read_publisher),
         ('tags', read_tags),
         ('rating', read_rating))

    series, series_index = read(read_series)
    if series:
        mi.series, mi.series_index = series, series_index

    fill(('author_link_map', read_author_link_map),
         ('user_categories', read_user_categories))

    for name, field_meta in iteritems(read(read_user_metadata) or {}):
        mi.set_user_metadata(name, field_meta)

    if return_extra_data:
        return mi, ver, read(read_raster_cover), read(first_spine_item)
    return mi
def read_metadata(root, ver=None, return_extra_data=False):
    """Read the metadata stored under ``root`` into a Metadata object.

    Every field starts from the defaults of
    ``Metadata(_('Unknown'), [_('Unknown')])`` and is replaced only when the
    matching reader returns a usable value.

    :param root: parsed tree handed to each ``read_*`` helper.
    :param ver: value passed through untouched into the extra-data tuple.
    :param return_extra_data: when true, return the tuple
        ``(metadata, ver, raster_cover, first_spine_item)`` instead of the
        Metadata object alone.
    :return: the Metadata object, or the 4-tuple described above.
    """
    mi = Metadata(_('Unknown'), [_('Unknown')])
    prefixes = read_prefixes(root)
    refines = read_refines(root)
    common = (root, prefixes, refines)

    # 'calibre' and 'uuid' identifiers map to dedicated attributes; every
    # other scheme keeps its first value.
    plain_ids = {}
    for scheme, values in read_identifiers(*common).iteritems():
        first = values[0]
        if scheme == 'calibre':
            mi.application_id = first
        elif scheme == 'uuid':
            mi.uuid = first
        else:
            plain_ids[scheme] = first
    mi.set_identifiers(plain_ids)

    mi.title = read_title(*common) or mi.title
    mi.title_sort = read_title_sort(*common) or mi.title_sort
    mi.languages = read_languages(*common) or mi.languages

    names, sort_names = [], []
    for author in read_authors(*common):
        names.append(author.name)
        sort_names.append(author.sort)
    mi.authors = names or mi.authors
    mi.author_sort = authors_to_string(sort_names) or mi.author_sort

    producers = read_book_producers(*common)
    if producers and producers[0]:
        mi.book_producer = producers[0]

    # Dates are copied only when the reader returns a defined date.
    published = read_pubdate(*common)
    if not is_date_undefined(published):
        mi.pubdate = published
    stamp = read_timestamp(*common)
    if not is_date_undefined(stamp):
        mi.timestamp = stamp
    modified = read_last_modified(*common)
    if not is_date_undefined(modified):
        mi.last_modified = modified

    mi.comments = read_comments(*common) or mi.comments
    mi.publisher = read_publisher(*common) or mi.publisher
    mi.tags = read_tags(*common) or mi.tags
    mi.rating = read_rating(*common) or mi.rating

    series, series_index = read_series(*common)
    if series:
        mi.series, mi.series_index = series, series_index

    mi.author_link_map = read_author_link_map(*common) or mi.author_link_map
    mi.user_categories = read_user_categories(*common) or mi.user_categories

    user_metadata = read_user_metadata(*common) or {}
    for name, field_meta in user_metadata.iteritems():
        mi.set_user_metadata(name, field_meta)

    if return_extra_data:
        return mi, ver, read_raster_cover(*common), first_spine_item(*common)
    return mi
def data2mi(self, item):
    """Turn one metadata answer (a dict) into a Metadata object.

    Keys missing from ``item`` are skipped. When both are present, 'year'
    overrides the publication date taken from 'updated', and 'journal'
    overrides the series taken from 'series'.

    :param item: dict holding a single metadata answer.
    :return: the populated Metadata object.
    """
    mi = Metadata(_('Unknown'))

    # Regular metadata
    mi.title = item.get('title', None)
    mi.authors = item.get('authors', [])
    mi.publisher = item.get('publisher', None)

    if 'id' in item:
        mi.set_identifier(self.idkey, item['id'])
    if 'doi' in item:
        mi.set_identifier('doi', item['doi'])
    if 'isbn' in item:
        mi.set_identifier('isbn', item['isbn'])

    if 'updated' in item:
        mi.pubdate = parse_date(item['updated'], assume_utc=True)

    if 'series' in item:
        mi.series = item['series']
        mi.series_index = self.format_series_index(
            item.get('series_index'), None)

    if 'year' in item:
        mi.pubdate = parse_date(item['year'], assume_utc=True)

    if 'abstract' in item:
        mi.comments = self.format_abstract(item['abstract'])

    if 'language' in item:
        mi.language = item['language']

    if 'journal' in item:
        mi.series = item['journal']
        volume, number = item.get('volume'), item.get('number')
        mi.series_index = self.format_series_index(volume, number)

    if 'subject' in item:
        # Union of the tags both helpers derive from each subject, sorted.
        collected = set()
        for subject in item['subject']:
            collected.update(msc_tags(subject))
            collected.update(arxiv_tags(subject))
        mi.tags = sorted(collected)

    return mi
def data2mi(self, item):
    """Build a Metadata object from a single metadata answer dict.

    Only keys present in ``item`` are applied. A 'year' entry replaces the
    publication date set from 'updated', and a 'journal' entry replaces the
    series set from 'series'.

    :param item: dict holding a single metadata answer.
    :return: the populated Metadata object.
    """
    result = Metadata(_('Unknown'))

    # Regular metadata
    result.title = item.get('title', None)
    result.authors = item.get('authors', [])
    result.publisher = item.get('publisher', None)

    # Identifiers: the source-specific id first, then the generic schemes.
    if 'id' in item:
        result.set_identifier(self.idkey, item['id'])
    for scheme in ('doi', 'isbn'):
        if scheme in item:
            result.set_identifier(scheme, item[scheme])

    if 'updated' in item:
        result.pubdate = parse_date(item['updated'], assume_utc=True)

    if 'series' in item:
        result.series = item['series']
        index = self.format_series_index(item.get('series_index'), None)
        result.series_index = index

    if 'year' in item:
        result.pubdate = parse_date(item['year'], assume_utc=True)

    if 'abstract' in item:
        result.comments = self.format_abstract(item['abstract'])

    if 'language' in item:
        result.language = item['language']

    if 'journal' in item:
        result.series = item['journal']
        index = self.format_series_index(item.get('volume'),
                                         item.get('number'))
        result.series_index = index

    if 'subject' in item:
        # Gather the tags both helpers derive from every subject.
        tag_pool = set()
        for entry in item['subject']:
            tag_pool.update(msc_tags(entry))
            tag_pool.update(arxiv_tags(entry))
        result.tags = sorted(tag_pool)

    return result
def _metadata(self, baike):
    """Convert a baike page object into a Metadata object.

    :param baike: page object providing ``get_info()``, ``get_tags()``,
        ``get_image()``, ``get_summary()``, ``get_id()`` and ``http.url``.
    :return: the populated Metadata object. The cover image is downloaded
        only when ``self.copy_image`` is set and a cover URL is available.
    """
    from calibre.ebooks.metadata.book.base import Metadata

    info = baike.get_info()
    logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])

    # Publisher fallback chain; later keys take precedence over earlier ones.
    plat = "网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)

    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[n]" reference marker from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    mi.provider_key = KEY
    mi.provider_value = baike.get_id()

    if self.copy_image and mi.cover_url:
        logging.debug("fetching cover: %s", mi.cover_url)
        img = io.BytesIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # For a finished serial, use the first date found in the status text as
    # the publication date; otherwise keep "now".
    status = info.get(u'连载状态', "")
    if u'完结' in status:
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date-like text, or text that is not a valid Y-m-d date.
            pass
    return mi
def _metadata(self, baike):
    """Convert a baike page object into a Metadata object.

    :param baike: page object providing ``get_info()``, ``get_tags()``,
        ``get_image()``, ``get_summary()`` and ``http.url``.
    :return: the populated Metadata object. The cover image is downloaded
        only when ``self.copy_image`` is set and a cover URL is available.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])

    # Publisher fallback chain; later keys take precedence over earlier ones.
    # The default must be unicode: it is passed through .replace() with
    # unicode arguments, which would force an ASCII decode of a byte string.
    plat = u"網絡小說平台"
    plat = info.get(u'首發狀態', plat)
    plat = info.get(u'首發網站', plat)
    plat = plat.replace(u'首發', u'')
    mi.publisher = info.get(u'連載平台', plat)

    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[n]" reference marker from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    # Only fetch the cover when there is actually a URL to fetch.
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # For a finished serial, use the first date found in the status text as
    # the publication date; otherwise keep "now".
    status = info.get(u'連載狀態', "")
    if u'完結' in status:
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No date-like text, or text that is not a valid Y-m-d date.
            pass
    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Parse an XMP packet and return its contents as a Metadata object.

    Fields absent from the packet keep the values ``Metadata(_('Unknown'))``
    starts with. Identifiers come from ``xmp:Identifier`` first, then from
    the ``prism``/``pdfx`` namespaces, then from ``dc:identifier``; a scheme
    that is already set is never overwritten by a later source.

    :param raw_bytes: the raw packet, handed to ``parse_xmp_packet``.
    :return: the populated Metadata object.
    :raises ValueError: if the title carries the marker of a corrupted
        packet (see the Nitro PDF note below).
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))

    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError(
                'Corrupted XMP metadata packet detected, probably generated by Nitro PDF'
            )
        mi.title = title

    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = [au for aus in authors for au in string_to_authors(aus)]

    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags

    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(
            first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root),
            assume_utc=False)
    except Exception:
        # Best effort: a missing or unparsable date leaves pubdate untouched.
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp

    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd

    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    # Use the first non-empty value found for each sort field.
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    # These two fields are stored as JSON text.
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                # Best effort: skip a value that cannot be decoded/assigned.
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            # identifiers is keyed by lower-case scheme names, so the
            # "already known" test has to use the lower-case form as well.
            key = scheme.lower()
            if key in identifiers:
                continue
            val = first_simple('//%s:%s' % (namespace, scheme), root)
            if key == 'isbn':
                val = check_isbn(val)
            elif key == 'doi':
                val = check_doi(val)
            if val:
                identifiers[key] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme whose checker accepted the value
                # (an ISBN must not end up filed as a DOI).
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)
    return mi
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    """Assemble a Metadata object for ``book_id`` from the stored fields.

    :param book_id: id of the book whose fields are read.
    :param get_user_categories: when true, ``user_categories`` is filled
        from the 'user_categories' preference; otherwise it is left as an
        empty dict.
    :return: the populated Metadata object.
    """
    field = self._field_for
    mi = Metadata(None, template_cache=self.formatter_template_cache)

    # Author names in order, plus name -> sort and name -> link maps.
    records = [self._author_data(i)
               for i in self._field_ids_for('authors', book_id)]
    names = []
    sort_map = {}
    link_map = {}
    for record in records:
        name = record['name']
        names.append(name)
        sort_map[name] = record['sort']
        link_map[name] = record['link']

    unknown_title = _('Unknown')
    mi.title = field('title', book_id, default_value=unknown_title)
    mi.authors = names
    mi.author_sort = field('author_sort', book_id,
                           default_value=_('Unknown'))
    mi.author_sort_map = sort_map
    mi.author_link_map = link_map
    mi.comments = field('comments', book_id)
    mi.publisher = field('publisher', book_id)

    # One "now" value is shared as the default for every date field.
    now = nowf()
    mi.timestamp = field('timestamp', book_id, default_value=now)
    mi.pubdate = field('pubdate', book_id, default_value=now)
    mi.uuid = field('uuid', book_id, default_value='dummy')
    mi.title_sort = field('sort', book_id, default_value=_('Unknown'))
    mi.book_size = field('size', book_id, default_value=0)
    mi.ondevice_col = field('ondevice', book_id, default_value='')
    mi.last_modified = field('last_modified', book_id, default_value=now)

    formats = field('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(field('languages', book_id))
    if formats:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        mi.formats = FormatsList(formats, mi.format_metadata)
    else:
        mi.formats = None

    has_cover = field('cover', book_id, default_value=False)
    mi.has_cover = _('Yes') if has_cover else ''
    mi.tags = list(field('tags', book_id, default_value=()))
    mi.series = field('series', book_id)
    if mi.series:
        mi.series_index = field('series_index', book_id, default_value=1.0)
    mi.rating = field('rating', book_id)
    mi.set_identifiers(field('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id

    # Custom columns: composite ones are evaluated last, once every other
    # field has been set on mi.
    composites = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composites.append(key)
            continue
        value = field(key, book_id)
        if isinstance(value, tuple):
            value = list(value)
        extra = field(key + '_index', book_id)
        mi.set(key, val=value, extra=extra)
    for key in composites:
        mi.set(key, val=self._composite_for(key, book_id, mi))

    # For each user category keep the [name, field] pairs whose name matches
    # this book's value for that field.
    category_values = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            matches = []
            for name, cat, ign in user_cats[ucat]:
                current = mi.get(cat, None)
                if isinstance(current, list):
                    hit = name in current
                else:
                    hit = name == current
                if hit:
                    matches.append([name, cat])
            category_values[ucat] = matches
    mi.user_categories = category_values

    return mi
def merge(self, results, min_year, do_asr=True):
    """Combine several metadata results into a single Metadata object.

    :param results: sequence of result objects to merge.
    :param min_year: when truthy, the merged publication date is forced
        into this year; otherwise the earliest publication date among the
        results is used.
    :param do_asr: when true, store the mean ``relevance_in_source`` of the
        results as ``average_source_relevance``.
    :return: the merged Metadata object.
    """
    merged = Metadata(_('Unknown'))
    pick = self.length_merge

    # We assume the shortest title has the least cruft in it
    merged.title = pick('title', results, null_value=merged.title)

    # No harm in having extra authors, maybe something useful like an
    # editor or translator
    merged.authors = pick('authors', results, null_value=merged.authors,
                          shortest=False)

    # We assume the shortest publisher has the least cruft in it
    merged.publisher = pick('publisher', results,
                            null_value=merged.publisher)

    # We assume the smallest set of tags has the least cruft in it
    merged.tags = pick('tags', results, null_value=merged.tags,
                       shortest=msprefs['fewer_tags'])

    # We assume the longest series has the most info in it
    merged.series = pick('series', results, null_value=merged.series,
                         shortest=False)
    for res in results:
        if res.series and res.series == merged.series:
            merged.series_index = res.series_index
            break

    # Average the rating over all sources
    ratings = [value for value in (res.rating for res in results)
               if value and 0 < value <= 5]
    if ratings:
        merged.rating = int(round(sum(ratings)/len(ratings)))

    # Smallest language is likely to be valid
    merged.language = pick('language', results, null_value=merged.language)

    # Choose longest comments
    merged.comments = pick('comments', results, null_value=merged.comments,
                           shortest=False)

    # Published date
    if min_year:
        # Prefer the full date of the first result published in min_year.
        for res in results:
            if getattr(res.pubdate, 'year', None) == min_year:
                merged.pubdate = res.pubdate
                break
        if getattr(merged.pubdate, 'year', None) == min_year:
            month, day = merged.pubdate.month, merged.pubdate.day
        else:
            month, day = 1, 2
        merged.pubdate = datetime(min_year, month, day, tzinfo=utc_tz)
    else:
        # Earliest date among the results; the year-3001 start value only
        # marks "nothing found".
        earliest = datetime(3001, 1, 1, tzinfo=utc_tz)
        for res in results:
            if res.pubdate is None:
                continue
            candidate = as_utc(res.pubdate)
            if candidate < earliest:
                earliest = candidate
        if earliest.year < 3000:
            merged.pubdate = earliest

    # Identifiers
    for res in results:
        merged.identifiers.update(res.identifiers)

    # Cover URL
    merged.has_cached_cover_url = any(
        getattr(res, 'has_cached_cover_url', False) for res in results)

    # Merge any other fields with no special handling (random merge)
    touched_fields = set()
    for res in results:
        if hasattr(res, 'identify_plugin'):
            touched_fields |= res.identify_plugin.touched_fields

    for name in touched_fields:
        if name.startswith('identifier:'):
            continue
        if not merged.is_null(name):
            continue
        current = getattr(merged, name)
        setattr(merged, name,
                self.random_merge(name, results, null_value=current))

    if do_asr:
        relevances = [res.relevance_in_source for res in results]
        merged.average_source_relevance = sum(relevances)/len(relevances)

    return merged
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    """Build the Metadata object for ``book_id`` from the stored fields.

    :param book_id: id of the book whose fields are read.
    :param get_user_categories: when true, ``user_categories`` is computed
        from the 'user_categories' preference; otherwise it stays an empty
        dict.
    :return: the populated Metadata object.
    """
    def lookup(name, **kwargs):
        # Every field is read for the same book.
        return self._field_for(name, book_id, **kwargs)

    mi = Metadata(None, template_cache=self.formatter_template_cache)

    # Author names in order, plus name -> sort and name -> link maps.
    author_ids = self._field_ids_for('authors', book_id)
    author_records = [self._author_data(i) for i in author_ids]
    author_names, author_sorts, author_links = [], {}, {}
    for rec in author_records:
        author = rec['name']
        author_names.append(author)
        author_sorts[author] = rec['sort']
        author_links[author] = rec['link']

    mi.title = lookup('title', default_value=_('Unknown'))
    mi.authors = author_names
    mi.author_sort = lookup('author_sort', default_value=_('Unknown'))
    mi.author_sort_map = author_sorts
    mi.author_link_map = author_links
    mi.comments = lookup('comments')
    mi.publisher = lookup('publisher')

    # A single "now" serves as the default for all date fields.
    current_time = nowf()
    mi.timestamp = lookup('timestamp', default_value=current_time)
    mi.pubdate = lookup('pubdate', default_value=current_time)
    mi.uuid = lookup('uuid', default_value='dummy')
    mi.title_sort = lookup('sort', default_value=_('Unknown'))
    mi.book_size = lookup('size', default_value=0)
    mi.ondevice_col = lookup('ondevice', default_value='')
    mi.last_modified = lookup('last_modified', default_value=current_time)

    formats = lookup('formats')
    mi.format_metadata = {}
    mi.languages = list(lookup('languages'))
    good_formats = None
    if formats:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        good_formats = FormatsList(formats, mi.format_metadata)
    mi.formats = good_formats

    if lookup('cover', default_value=False):
        mi.has_cover = _('Yes')
    else:
        mi.has_cover = ''
    mi.tags = list(lookup('tags', default_value=()))
    mi.series = lookup('series')
    if mi.series:
        mi.series_index = lookup('series_index', default_value=1.0)
    mi.rating = lookup('rating')
    mi.set_identifiers(lookup('identifiers', default_value={}))
    mi.application_id = book_id
    mi.id = book_id

    # Custom columns: composite columns are set in a second pass, after all
    # other fields are in place on mi.
    composite_keys = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composite_keys.append(key)
            continue
        val = lookup(key)
        if isinstance(val, tuple):
            val = list(val)
        extra = lookup(key+'_index')
        mi.set(key, val=val, extra=extra)
    for key in composite_keys:
        mi.set(key, val=self._composite_for(key, book_id, mi))

    # For each user category keep the [name, field] pairs whose name matches
    # this book's value for that field.
    per_category = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            found = []
            for name, cat, ign in user_cats[ucat]:
                v = mi.get(cat, None)
                matched = (name in v) if isinstance(v, list) else (name == v)
                if matched:
                    found.append([name, cat])
            per_category[ucat] = found
    mi.user_categories = per_category

    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Parse an XMP packet and return its contents as a Metadata object.

    Fields absent from the packet keep the values ``Metadata(_('Unknown'))``
    starts with. Identifiers come from ``xmp:Identifier`` first, then from
    the ``prism``/``pdfx`` namespaces, then from ``dc:identifier``; a scheme
    that is already set is never overwritten by a later source.

    :param raw_bytes: the raw packet, handed to ``parse_xmp_packet``.
    :return: the populated Metadata object.
    :raises ValueError: if the title carries the marker of a corrupted
        packet (see the Nitro PDF note below).
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))

    title = first_alt('//dc:title', root)
    # Test for a title first: a packet without one must not fail on
    # .startswith().
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
        mi.title = title

    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors

    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags

    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(
            first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root),
            assume_utc=False)
    except Exception:
        # Best effort: a missing or unparsable date leaves pubdate untouched.
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp

    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd

    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    # Use the first non-empty value found for each sort field.
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    # These two fields are stored as JSON text.
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:'+x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                # Best effort: skip a value that cannot be decoded/assigned.
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        # list() keeps the emptiness test below meaningful even where
        # filter() returns an iterator.
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            # identifiers is keyed by lower-case scheme names, so the
            # "already known" test has to use the lower-case form as well.
            key = scheme.lower()
            if key in identifiers:
                continue
            val = first_simple('//%s:%s' % (namespace, scheme), root)
            if key == 'isbn':
                val = check_isbn(val)
            elif key == 'doi':
                val = check_doi(val)
            if val:
                identifiers[key] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme whose checker accepted the value
                # (an ISBN must not end up filed as a DOI).
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)
    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Parse an XMP packet and return its contents as a Metadata object.

    Fields absent from the packet keep the values ``Metadata(_('Unknown'))``
    starts with. Identifiers come from ``xmp:Identifier`` first, then from
    the ``prism``/``pdfx`` namespaces, then from ``dc:identifier``; a scheme
    that is already set is never overwritten by a later source.

    :param raw_bytes: the raw packet, handed to ``parse_xmp_packet``.
    :return: the populated Metadata object.
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))

    title = first_alt('//dc:title', root)
    if title:
        mi.title = title

    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors

    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags

    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(
            first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root),
            assume_utc=False)
    except Exception:
        # Best effort: a missing or unparsable date leaves pubdate untouched.
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp

    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    # Use the first non-empty value found for each sort field.
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    languages = multiple_sequences('//dc:language', root)
    if languages:
        # list() keeps the emptiness test below meaningful even where
        # filter() returns an iterator.
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            # identifiers is keyed by lower-case scheme names, so the
            # "already known" test has to use the lower-case form as well.
            key = scheme.lower()
            if key in identifiers:
                continue
            val = first_simple('//%s:%s' % (namespace, scheme), root)
            if key == 'isbn':
                val = check_isbn(val)
            elif key == 'doi':
                val = check_doi(val)
            if val:
                identifiers[key] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme whose checker accepted the value
                # (an ISBN must not end up filed as a DOI).
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Parse an XMP packet and return its contents as a Metadata object.

    Fields absent from the packet keep the values ``Metadata(_('Unknown'))``
    starts with. Identifiers come from ``xmp:Identifier`` first, then from
    the ``prism``/``pdfx`` namespaces, then from ``dc:identifier``; a scheme
    that is already set is never overwritten by a later source.

    :param raw_bytes: the raw packet, handed to ``parse_xmp_packet``.
    :return: the populated Metadata object.
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))

    title = first_alt('//dc:title', root)
    if title:
        mi.title = title

    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors

    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags

    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(
            first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root),
            assume_utc=False)
    except Exception:
        # Best effort: a missing or unparsable date leaves pubdate untouched.
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp

    md = first_simple('//xmp:MetadataDate', root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except Exception:
            # Best effort: an unparsable metadata date is skipped.
            pass

    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    # Use the first non-empty value found for each sort field.
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    # These two fields are stored as JSON text.
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                # Best effort: skip a value that cannot be decoded/assigned.
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        # list() keeps the emptiness test below meaningful even where
        # filter() returns an iterator.
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            # identifiers is keyed by lower-case scheme names, so the
            # "already known" test has to use the lower-case form as well.
            key = scheme.lower()
            if key in identifiers:
                continue
            val = first_simple('//%s:%s' % (namespace, scheme), root)
            if key == 'isbn':
                val = check_isbn(val)
            elif key == 'doi':
                val = check_doi(val)
            if val:
                identifiers[key] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme whose checker accepted the value
                # (an ISBN must not end up filed as a DOI).
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)
    return mi
def mi(self):
    """Return a Metadata object built from this object's input widgets.

    The title and authors come from the stripped text of ``self.title`` and
    ``self.authors``; the languages come from ``self.languages.lang_codes``.
    Each falls back to a default (``_('Unknown')`` or ``get_lang()``) when
    the value is empty.
    """
    title_text = unicode_type(self.title.text()).strip()
    result = Metadata(title_text or _('Unknown'))

    authors_text = unicode_type(self.authors.text()).strip()
    result.authors = string_to_authors(authors_text) or [_('Unknown')]

    result.languages = self.languages.lang_codes or [get_lang()]
    return result
def metadata_from_xmp_packet(raw_bytes):
    """Parse an XMP packet and return its contents as a Metadata object.

    Fields absent from the packet keep the values ``Metadata(_("Unknown"))``
    starts with. Identifiers come from ``xmp:Identifier`` first, then from
    the ``prism``/``pdfx`` namespaces, then from ``dc:identifier``; a scheme
    that is already set is never overwritten by a later source.

    :param raw_bytes: the raw packet, handed to ``parse_xmp_packet``.
    :return: the populated Metadata object.
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_("Unknown"))

    title = first_alt("//dc:title", root)
    if title:
        mi.title = title

    authors = multiple_sequences("//dc:creator", root)
    if authors:
        mi.authors = authors

    tags = multiple_sequences("//dc:subject", root) or multiple_sequences("//pdf:Keywords", root)
    if tags:
        mi.tags = tags

    comments = first_alt("//dc:description", root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences("//dc:publisher", root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(
            first_sequence("//dc:date", root) or first_simple("//xmp:CreateDate", root), assume_utc=False
        )
    except Exception:
        # Best effort: a missing or unparsable date leaves pubdate untouched.
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple("//xmp:CreatorTool", root)
    if bkp:
        mi.book_producer = bkp

    md = first_simple("//xmp:MetadataDate", root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except Exception:
            # Best effort: an unparsable metadata date is skipped.
            pass

    rating = first_simple("//calibre:rating", root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    # Use the first non-empty value found for each sort field.
    for x in ("title_sort", "author_sort"):
        for elem in XPath("//calibre:" + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    # These two fields are stored as JSON text.
    for x in ("author_link_map", "user_categories"):
        val = first_simple("//calibre:" + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                # Best effort: skip a value that cannot be decoded/assigned.
                pass

    languages = multiple_sequences("//dc:language", root)
    if languages:
        # list() keeps the emptiness test below meaningful even where
        # filter() returns an iterator.
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath("//xmp:Identifier")(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ("prism", "pdfx"):
        for scheme in KNOWN_ID_SCHEMES:
            # identifiers is keyed by lower-case scheme names, so the
            # "already known" test has to use the lower-case form as well.
            key = scheme.lower()
            if key in identifiers:
                continue
            val = first_simple("//%s:%s" % (namespace, scheme), root)
            if key == "isbn":
                val = check_isbn(val)
            elif key == "doi":
                val = check_doi(val)
            if val:
                identifiers[key] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (("doi", check_doi), ("isbn", check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple("//dc:identifier", root))
            if val:
                # Store under the scheme whose checker accepted the value
                # (an ISBN must not end up filed as a DOI).
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)
    return mi