def parse_details(self, root):
    try:
        CBDB_id = self.parse_CBDB_id(self.url)
    except:
        self.log.exception('Error parsing CBDB id for url: %r' % self.url)
        CBDB_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not CBDB_id:
        self.log.error('Could not find title/authors/CBDB id for %r' % self.url)
        self.log.error('CBDB: %r Title: %r Authors: %r' % (CBDB_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('cbdb', CBDB_id)
    self.CBDB_id = CBDB_id

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    # summary
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_urls = self.parse_covers(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_urls)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)

    mi.source_relevance = self.relevance
    mi.language = 'Czech'

    if self.CBDB_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)
        if self.cover_urls:
            self.plugin.cache_identifier_to_cover_url(self.CBDB_id, self.cover_urls)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
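
# Illustrative sketch (not part of the plugin above): the core pattern of these
# parse_details() methods is to build a Metadata object from title/authors and
# attach the source's identifier. The 'cbdb' value below is a made-up example,
# not a real lookup.
from calibre.ebooks.metadata.book.base import Metadata

mi = Metadata('Example Title', ['Example Author'])
mi.set_identifier('cbdb', '12345')  # hypothetical id value
assert mi.identifiers.get('cbdb') == '12345'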
def reset_info(self):
    self.show_data(Metadata(_('Unknown')))
def identify(log, abort,  # {{{
             title=None, authors=None, identifiers={}, timeout=30,
             allowed_plugins=None):
    if title == _('Unknown'):
        title = None
    if authors == [_('Unknown')]:
        authors = None
    start_time = time.time()

    plugins = [
        p for p in metadata_plugins(['identify'])
        if p.is_configured() and (allowed_plugins is None or p.name in allowed_plugins)
    ]

    kwargs = {
        'title': title,
        'authors': authors,
        'identifiers': identifiers,
        'timeout': timeout,
    }

    log('Running identify query with parameters:')
    log(kwargs)
    log('Using plugins:', ', '.join(['%s %s' % (p.name, p.version) for p in plugins]))
    log('The log from individual plugins is below')

    workers = [Worker(p, kwargs, abort) for p in plugins]
    for w in workers:
        w.start()

    first_result_at = None
    results = {}
    for p in plugins:
        results[p] = []
    logs = dict([(w.plugin, w.buf) for w in workers])

    def get_results():
        found = False
        for w in workers:
            try:
                result = w.rq.get_nowait()
            except Empty:
                pass
            else:
                results[w.plugin].append(result)
                found = True
        return found

    wait_time = msprefs['wait_after_first_identify_result']
    while True:
        time.sleep(0.2)

        if get_results() and first_result_at is None:
            first_result_at = time.time()

        if not is_worker_alive(workers):
            break

        if (first_result_at is not None and
                time.time() - first_result_at > wait_time):
            log.warn('Not waiting any longer for more results. Still running'
                     ' sources:')
            for worker in workers:
                if worker.is_alive():
                    log.debug('\t' + worker.name)
            abort.set()
            break

    while not abort.is_set() and get_results():
        pass

    sort_kwargs = dict(kwargs)
    for k in list(sort_kwargs.iterkeys()):
        if k not in ('title', 'authors', 'identifiers'):
            sort_kwargs.pop(k)

    longest, lp = -1, ''
    for plugin, presults in results.iteritems():
        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))

        # Throw away lower priority results from the same source that have
        # exactly the same title and authors as a higher priority result
        filter_results = set()
        filtered_results = []
        for r in presults:
            key = (r.title, tuple(r.authors))
            if key not in filter_results:
                filtered_results.append(r)
                filter_results.add(key)
        results[plugin] = presults = filtered_results

        plog = logs[plugin].getvalue().strip()
        log('\n' + '*' * 30, plugin.name, '%s' % (plugin.version,), '*' * 30)
        log('Found %d results' % len(presults))
        time_spent = getattr(plugin, 'dl_time_spent', None)
        if time_spent is None:
            log('Downloading was aborted')
            longest, lp = -1, plugin.name
        else:
            log('Downloading from', plugin.name, 'took', time_spent)
            if time_spent > longest:
                longest, lp = time_spent, plugin.name
        for r in presults:
            log('\n\n---')
            try:
                log(unicode(r))
            except TypeError:
                log(repr(r))
        if plog:
            log(plog)
        log('\n' + '*' * 80)

        dummy = Metadata(_('Unknown'))
        for i, result in enumerate(presults):
            for f in plugin.prefs['ignore_fields']:
                if ':' not in f:
                    setattr(result, f, getattr(dummy, f))
                if f == 'series':
                    result.series_index = dummy.series_index
            result.relevance_in_source = i
            result.has_cached_cover_url = (
                plugin.cached_cover_url_is_reliable and
                plugin.get_cached_cover_url(result.identifiers) is not None)
            result.identify_plugin = plugin
            if msprefs['txt_comments']:
                if plugin.has_html_comments and result.comments:
                    result.comments = html2text(result.comments)

    log('The identify phase took %.2f seconds' % (time.time() - start_time))
    log('The longest time (%f) was taken by:' % longest, lp)
    log('Merging results from different sources')
    start_time = time.time()
    results = merge_identify_results(results, log)

    log('We have %d merged results, merging took: %.2f seconds' % (
        len(results), time.time() - start_time))
    tm_rules = msprefs['tag_map_rules']
    if tm_rules:
        from calibre.ebooks.metadata.tag_mapper import map_tags
    am_rules = msprefs['author_map_rules']
    if am_rules:
        from calibre.ebooks.metadata.author_mapper import map_authors, compile_rules
        am_rules = compile_rules(am_rules)

    max_tags = msprefs['max_tags']
    for r in results:
        if tm_rules:
            r.tags = map_tags(r.tags, tm_rules)
        r.tags = r.tags[:max_tags]
        if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year:
            r.pubdate = None

    if msprefs['swap_author_names']:
        for r in results:
            def swap_to_ln_fn(a):
                if ',' in a:
                    return a
                parts = a.split(None)
                if len(parts) <= 1:
                    return a
                surname = parts[-1]
                return '%s, %s' % (surname, ' '.join(parts[:-1]))

            r.authors = [swap_to_ln_fn(a) for a in r.authors]

    if am_rules:
        for r in results:
            new_authors = map_authors(r.authors, am_rules)
            if new_authors != r.authors:
                r.authors = new_authors
                r.author_sort = authors_to_sort_string(r.authors)

    return results
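
# Quick standalone check of the name-swapping helper used above (copied here
# for illustration only): names without a comma become "Last, First", while
# names that already contain a comma, or single-word names, pass through.
def swap_to_ln_fn(a):
    if ',' in a:
        return a
    parts = a.split(None)
    if len(parts) <= 1:
        return a
    surname = parts[-1]
    return '%s, %s' % (surname, ' '.join(parts[:-1]))

assert swap_to_ln_fn('Kovid Goyal') == 'Goyal, Kovid'
assert swap_to_ln_fn('Goyal, Kovid') == 'Goyal, Kovid'
assert swap_to_ln_fn('Plato') == 'Plato'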
def __init__(self, prefix, lpath, title=None, authors=None, mime=None,
             date=None, ContentType=None, thumbnail_name=None, size=None,
             other=None):
    from calibre.utils.date import parse_date
    # debug_print('Book::__init__ - title=', title)
    show_debug = title is not None and title.lower().find("xxxxx") >= 0
    if other is not None:
        other.title = title
        other.published_date = date
    if show_debug:
        debug_print("Book::__init__ - title=", title, 'authors=', authors)
        debug_print("Book::__init__ - other=", other)
    super(Book, self).__init__(prefix, lpath, size, other)

    if title is not None and len(title) > 0:
        self.title = title

    if authors is not None and len(authors) > 0:
        self.authors_from_string(authors)
        if self.author_sort is None or self.author_sort == "Unknown":
            self.author_sort = author_to_author_sort(authors)

    self.mime = mime

    self.size = size  # will be set later if None

    if ContentType == '6' and date is not None:
        try:
            self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except:
            try:
                self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
            except:
                try:
                    self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%d")
                except:
                    try:
                        self.datetime = parse_date(date, assume_utc=True).timetuple()
                    except:
                        try:
                            self.datetime = time.gmtime(os.path.getctime(self.path))
                        except:
                            self.datetime = time.gmtime()

    self.kobo_metadata = Metadata(title, self.authors)
    self.contentID = None
    self.current_shelves = []
    self.kobo_collections = []
    self.can_put_on_shelves = True
    self.kobo_series = None
    # Kobo stores the series number as a string, and it can have a leading "#".
    self.kobo_series_number = None
    self.kobo_series_id = None
    self.kobo_subtitle = None

    if thumbnail_name is not None:
        self.thumbnail = ImageWrapper(thumbnail_name)

    if show_debug:
        debug_print("Book::__init__ end - self=", self)
        debug_print("Book::__init__ end - title=", title, 'authors=', authors)
def validate(self, x):
    from calibre.ebooks.metadata.book.base import Metadata
    return self.safe_format(x, {}, 'VALIDATE ERROR', Metadata(''))
def test_legacy_direct(self):  # {{{
    'Test read-only methods that are directly equivalent in the old and new interface'
    from calibre.ebooks.metadata.book.base import Metadata
    from datetime import timedelta
    ndb = self.init_legacy(self.cloned_library)
    db = self.init_old()
    newstag = ndb.new_api.get_item_id('tags', 'news')

    self.assertEqual(dict(db.prefs), dict(ndb.prefs))

    for meth, args in iteritems({
        'find_identical_books': [(Metadata('title one', ['author one']),),
                                 (Metadata('unknown'),), (Metadata('xxxx'),)],
        'get_books_for_category': [('tags', newstag), ('#formats', 'FMT1')],
        'get_next_series_num_for': [('A Series One',)],
        'get_id_from_uuid': [('ddddd',), (db.uuid(1, True),)],
        'cover': [(0,), (1,), (2,)],
        'get_author_id': [('author one',), ('unknown',), ('xxxxx',)],
        'series_id': [(0,), (1,), (2,)],
        'publisher_id': [(0,), (1,), (2,)],
        '@tags_older_than': [
            ('News', None), ('Tag One', None), ('xxxx', None),
            ('Tag One', None, 'News'), ('News', None, 'xxxx'),
            ('News', None, None, ['xxxxxxx']),
            ('News', None, 'Tag One', ['Author Two', 'Author One']),
            ('News', timedelta(0), None, None),
            ('News', timedelta(100000)),
        ],
        'format': [(1, 'FMT1', True), (2, 'FMT1', True), (0, 'xxxxxx')],
        'has_format': [(1, 'FMT1', True), (2, 'FMT1', True), (0, 'xxxxxx')],
        'sizeof_format': [(1, 'FMT1', True), (2, 'FMT1', True), (0, 'xxxxxx')],
        '@format_files': [(0,), (1,), (2,)],
        'formats': [(0,), (1,), (2,)],
        'max_size': [(0,), (1,), (2,)],
        'format_hash': [(1, 'FMT1'), (1, 'FMT2'), (2, 'FMT1')],
        'author_sort_from_authors': [(['Author One', 'Author Two', 'Unknown'],)],
        'has_book': [(Metadata('title one'),), (Metadata('xxxx1111'),)],
        'has_id': [(1,), (2,), (3,), (9999,)],
        'id': [(1,), (2,), (0,)],
        'index': [(1,), (2,), (3,)],
        'row': [(1,), (2,), (3,)],
        'is_empty': [()],
        'count': [()],
        'all_author_names': [()],
        'all_tag_names': [()],
        'all_series_names': [()],
        'all_publisher_names': [()],
        '!all_authors': [()],
        '!all_tags2': [()],
        '@all_tags': [()],
        '@get_all_identifier_types': [()],
        '!all_publishers': [()],
        '!all_titles': [()],
        '!all_series': [()],
        'standard_field_keys': [()],
        'all_field_keys': [()],
        'searchable_fields': [()],
        'search_term_to_field_key': [('author',), ('tag',)],
        'metadata_for_field': [('title',), ('tags',)],
        'sortable_field_keys': [()],
        'custom_field_keys': [(True,), (False,)],
        '!get_usage_count_by_id': [('authors',), ('tags',), ('series',),
                                   ('publisher',), ('#tags',), ('languages',)],
        'get_field': [(1, 'title'), (2, 'tags'), (0, 'rating'),
                      (1, 'authors'), (2, 'series'), (1, '#tags')],
        'all_formats': [()],
        'get_authors_with_ids': [()],
        '!get_tags_with_ids': [()],
        '!get_series_with_ids': [()],
        '!get_publishers_with_ids': [()],
        '!get_ratings_with_ids': [()],
        '!get_languages_with_ids': [()],
        'tag_name': [(3,)],
        'author_name': [(3,)],
        'series_name': [(3,)],
        'authors_sort_strings': [(0,), (1,), (2,)],
        'author_sort_from_book': [(0,), (1,), (2,)],
        'authors_with_sort_strings': [(0,), (1,), (2,)],
        'book_on_device_string': [(1,), (2,), (3,)],
        'books_in_series_of': [(0,), (1,), (2,)],
        'books_with_same_title': [(Metadata(db.title(0)),),
                                  (Metadata(db.title(1)),), (Metadata('1234'),)],
    }):
        fmt = lambda x: x
        if meth[0] in {'!', '@'}:
            fmt = {'!': dict, '@': frozenset}[meth[0]]
            meth = meth[1:]
        elif meth == 'get_authors_with_ids':
            fmt = lambda val: {x[0]: tuple(x[1:]) for x in val}
        for a in args:
            self.assertEqual(
                fmt(getattr(db, meth)(*a)), fmt(getattr(ndb, meth)(*a)),
                'The method: %s() returned different results for argument %s' % (meth, a))

    # get_top_level_move_items is broken in the old db on case-insensitive file systems
    def f(x, y):
        x.discard('metadata_db_prefs_backup.json')
        return x, y

    self.assertEqual(f(*db.get_top_level_move_items()),
                     f(*ndb.get_top_level_move_items()))

    d1, d2 = BytesIO(), BytesIO()
    db.copy_cover_to(1, d1, True)
    ndb.copy_cover_to(1, d2, True)
    self.assertTrue(d1.getvalue() == d2.getvalue())
    d1, d2 = BytesIO(), BytesIO()
    db.copy_format_to(1, 'FMT1', d1, True)
    ndb.copy_format_to(1, 'FMT1', d2, True)
    self.assertTrue(d1.getvalue() == d2.getvalue())

    old = db.get_data_as_dict(prefix='test-prefix')
    new = ndb.get_data_as_dict(prefix='test-prefix')
    for o, n in zip(old, new):
        o = {type('')(k) if isinstance(k, bytes) else k:
             set(v) if isinstance(v, list) else v for k, v in iteritems(o)}
        n = {k: set(v) if isinstance(v, list) else v for k, v in iteritems(n)}
        self.assertEqual(o, n)

    ndb.search('title:Unknown')
    db.search('title:Unknown')
    self.assertEqual(db.row(3), ndb.row(3))
    self.assertRaises(ValueError, ndb.row, 2)
    self.assertRaises(ValueError, db.row, 2)

    db.close()
def test_legacy_setters(self):  # {{{
    'Test methods that are directly equivalent in the old and new interface'
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import now
    n = now()
    ndb = self.init_legacy(self.cloned_library)
    amap = ndb.new_api.get_id_map('authors')
    sorts = [(aid, 's%d' % aid) for aid in amap]
    db = self.init_old(self.cloned_library)
    run_funcs(self, db, ndb, (
        ('+format_metadata', 1, 'FMT1', itemgetter('size')),
        ('+format_metadata', 1, 'FMT2', itemgetter('size')),
        ('+format_metadata', 2, 'FMT1', itemgetter('size')),
        ('get_tags', 0), ('get_tags', 1), ('get_tags', 2),
        ('is_tag_used', 'News'), ('is_tag_used', 'xchkjgfh'),
        ('bulk_modify_tags', (1,), ['t1'], ['News']),
        ('bulk_modify_tags', (2,), ['t1'], ['Tag One', 'Tag Two']),
        ('bulk_modify_tags', (3,), ['t1', 't2', 't3']),
        (db.clean,),
        ('@all_tags',),
        ('@tags', 0), ('@tags', 1), ('@tags', 2),
        ('unapply_tags', 1, ['t1']),
        ('unapply_tags', 2, ['xxxx']),
        ('unapply_tags', 3, ['t2', 't3']),
        (db.clean,),
        ('@all_tags',),
        ('@tags', 0), ('@tags', 1), ('@tags', 2),
        ('update_last_modified', (1,), True, n),
        ('update_last_modified', (3,), True, n),
        ('metadata_last_modified', 1, True),
        ('metadata_last_modified', 3, True),
        ('set_sort_field_for_author', sorts[0][0], sorts[0][1]),
        ('set_sort_field_for_author', sorts[1][0], sorts[1][1]),
        ('set_sort_field_for_author', sorts[2][0], sorts[2][1]),
        ('set_link_field_for_author', sorts[0][0], sorts[0][1]),
        ('set_link_field_for_author', sorts[1][0], sorts[1][1]),
        ('set_link_field_for_author', sorts[2][0], sorts[2][1]),
        (db.refresh,),
        ('author_sort', 0), ('author_sort', 1), ('author_sort', 2),
    ))
    omi = [db.get_metadata(x) for x in (0, 1, 2)]
    nmi = [ndb.get_metadata(x) for x in (0, 1, 2)]
    self.assertEqual([x.author_sort_map for x in omi],
                     [x.author_sort_map for x in nmi])
    self.assertEqual([x.author_link_map for x in omi],
                     [x.author_link_map for x in nmi])
    db.close()

    ndb = self.init_legacy(self.cloned_library)
    db = self.init_old(self.cloned_library)

    run_funcs(self, db, ndb, (
        ('set_authors', 1, ('author one',),),
        ('set_authors', 2, ('author two',), True, True, True),
        ('set_author_sort', 3, 'new_aus'),
        ('set_comment', 1, ''),
        ('set_comment', 2, None),
        ('set_comment', 3, '<p>a comment</p>'),
        ('set_has_cover', 1, True),
        ('set_has_cover', 2, True),
        ('set_has_cover', 3, 1),
        ('set_identifiers', 2, {'test': '', 'a': 'b'}),
        ('set_identifiers', 3, {'id': '1', 'isbn': '9783161484100'}),
        ('set_identifiers', 1, {}),
        ('set_languages', 1, ('en',)),
        ('set_languages', 2, ()),
        ('set_languages', 3, ('deu', 'spa', 'fra')),
        ('set_pubdate', 1, None),
        ('set_pubdate', 2, '2011-1-7'),
        ('set_series', 1, 'a series one'),
        ('set_series', 2, 'another series [7]'),
        ('set_series', 3, 'a third series'),
        ('set_publisher', 1, 'publisher two'),
        ('set_publisher', 2, None),
        ('set_publisher', 3, 'a third puB'),
        ('set_rating', 1, 2.3),
        ('set_rating', 2, 0),
        ('set_rating', 3, 8),
        ('set_timestamp', 1, None),
        ('set_timestamp', 2, '2011-1-7'),
        ('set_uuid', 1, None),
        ('set_uuid', 2, 'a test uuid'),
        ('set_title', 1, 'title two'),
        ('set_title', 2, None),
        ('set_title', 3, 'The Test Title'),
        ('set_tags', 1, ['a1', 'a2'], True),
        ('set_tags', 2, ['b1', 'tag one'], False, False, False, True),
        ('set_tags', 3, ['A1']),
        (db.refresh,),
        ('title', 0), ('title', 1), ('title', 2),
        ('title_sort', 0), ('title_sort', 1), ('title_sort', 2),
        ('authors', 0), ('authors', 1), ('authors', 2),
        ('author_sort', 0), ('author_sort', 1), ('author_sort', 2),
        ('has_cover', 3), ('has_cover', 1), ('has_cover', 2),
        ('get_identifiers', 0), ('get_identifiers', 1), ('get_identifiers', 2),
        ('pubdate', 0), ('pubdate', 1), ('pubdate', 2),
        ('timestamp', 0), ('timestamp', 1), ('timestamp', 2),
        ('publisher', 0), ('publisher', 1), ('publisher', 2),
        ('rating', 0), ('+rating', 1, lambda x: x or 0), ('rating', 2),
        ('series', 0), ('series', 1), ('series', 2),
        ('series_index', 0), ('series_index', 1), ('series_index', 2),
        ('uuid', 0), ('uuid', 1), ('uuid', 2),
        ('isbn', 0), ('isbn', 1), ('isbn', 2),
        ('@tags', 0), ('@tags', 1), ('@tags', 2),
        ('@all_tags',),
        ('@get_all_identifier_types',),
        ('set_title_sort', 1, 'Title Two'),
        ('set_title_sort', 2, None),
        ('set_title_sort', 3, 'The Test Title_sort'),
        ('set_series_index', 1, 2.3),
        ('set_series_index', 2, 0),
        ('set_series_index', 3, 8),
        ('set_identifier', 1, 'moose', 'val'),
        ('set_identifier', 2, 'test', ''),
        ('set_identifier', 3, '', ''),
        (db.refresh,),
        ('series_index', 0), ('series_index', 1), ('series_index', 2),
        ('title_sort', 0), ('title_sort', 1), ('title_sort', 2),
        ('get_identifiers', 0), ('get_identifiers', 1), ('get_identifiers', 2),
        ('@get_all_identifier_types',),
        ('set_metadata', 1, Metadata('title', ('a1',)), False, False, False, True, True),
        ('set_metadata', 3, Metadata('title', ('a1',))),
        (db.refresh,),
        ('title', 0), ('title', 1), ('title', 2),
        ('title_sort', 0), ('title_sort', 1), ('title_sort', 2),
        ('authors', 0), ('authors', 1), ('authors', 2),
        ('author_sort', 0), ('author_sort', 1), ('author_sort', 2),
        ('@tags', 0), ('@tags', 1), ('@tags', 2),
        ('@all_tags',),
        ('@get_all_identifier_types',),
    ))
    db.close()

    ndb = self.init_legacy(self.cloned_library)
    db = self.init_old(self.cloned_library)

    run_funcs(self, db, ndb, (
        ('set', 0, 'title', 'newtitle'),
        ('set', 0, 'tags', 't1,t2,tag one', True),
        ('set', 0, 'authors', 'author one & Author Two', True),
        ('set', 0, 'rating', 3.2),
        ('set', 0, 'publisher', 'publisher one', False),
        (db.refresh,),
        ('title', 0), ('rating', 0),
        ('#tags', 0), ('#tags', 1), ('#tags', 2),
        ('authors', 0), ('authors', 1), ('authors', 2),
        ('publisher', 0), ('publisher', 1), ('publisher', 2),
        ('delete_tag', 'T1'), ('delete_tag', 'T2'),
        ('delete_tag', 'Tag one'), ('delete_tag', 'News'),
        (db.clean,), (db.refresh,),
        ('@all_tags',),
        ('#tags', 0), ('#tags', 1), ('#tags', 2),
    ))
    db.close()

    ndb = self.init_legacy(self.cloned_library)
    db = self.init_old(self.cloned_library)

    run_funcs(self, db, ndb, (
        ('remove_all_tags', (1, 2, 3)),
        (db.clean,),
        ('@all_tags',),
        ('@tags', 0), ('@tags', 1), ('@tags', 2),
    ))
    db.close()

    ndb = self.init_legacy(self.cloned_library)
    db = self.init_old(self.cloned_library)

    a = {v: k for k, v in iteritems(ndb.new_api.get_id_map('authors'))}['Author One']
    t = {v: k for k, v in iteritems(ndb.new_api.get_id_map('tags'))}['Tag One']
    s = {v: k for k, v in iteritems(ndb.new_api.get_id_map('series'))}['A Series One']
    p = {v: k for k, v in iteritems(ndb.new_api.get_id_map('publisher'))}['Publisher One']

    run_funcs(self, db, ndb, (
        ('rename_author', a, 'Author Two'),
        ('rename_tag', t, 'News'),
        ('rename_series', s, 'ss'),
        ('rename_publisher', p, 'publisher one'),
        (db.clean,), (db.refresh,),
        ('@all_tags',),
        ('tags', 0), ('tags', 1), ('tags', 2),
        ('series', 0), ('series', 1), ('series', 2),
        ('publisher', 0), ('publisher', 1), ('publisher', 2),
        ('series_index', 0), ('series_index', 1), ('series_index', 2),
        ('authors', 0), ('authors', 1), ('authors', 2),
        ('author_sort', 0), ('author_sort', 1), ('author_sort', 2),
    ))
    db.close()
if fmt == 'azw3':
    with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
        for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
            with open(name, 'wb') as f:
                f.write(data)
        c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
        opf_to_azw3(opf_name, path, c)
else:
    with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
        zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
        zf.writestr('META-INF/', b'', 0o755)
        zf.writestr('META-INF/container.xml', CONTAINER)
        zf.writestr(opf_name, opf)
        zf.writestr(html_name, HTML)
        zf.writestr(toc_name, ncx)


if __name__ == '__main__':
    from calibre.ebooks.metadata.book.base import Metadata
    mi = Metadata('Test book', authors=('Kovid Goyal',))
    path = sys.argv[-1]
    ext = path.rpartition('.')[-1].lower()
    if ext not in valid_empty_formats:
        print('Unsupported format:', ext)
        raise SystemExit(1)
    create_book(mi, path, fmt=ext)
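
# Usage sketch for the writer above, mirroring its __main__ block; the output
# path is hypothetical. Note that the EPUB branch writes the 'mimetype' entry
# first and uncompressed (ZIP_STORED), which the EPUB OCF container format
# requires.
from calibre.ebooks.metadata.book.base import Metadata

mi = Metadata('Empty Book', authors=('Some Author',))
create_book(mi, '/tmp/empty.epub', fmt='epub')  # writes a skeleton EPUB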
def test_set_metadata(self):  # {{{
    ' Test setting of metadata '
    ae = self.assertEqual
    cache = self.init_cache(self.cloned_library)

    # Check that changing title/author updates the path
    mi = cache.get_metadata(1)
    old_path = cache.field_for('path', 1)
    old_title, old_author = mi.title, mi.authors[0]
    ae(old_path, '%s/%s (1)' % (old_author, old_title))
    mi.title, mi.authors = 'New Title', ['New Author']
    cache.set_metadata(1, mi)
    ae(cache.field_for('path', 1), '%s/%s (1)' % (mi.authors[0], mi.title))
    p = cache.format_abspath(1, 'FMT1')
    self.assertTrue(mi.authors[0] in p and mi.title in p)

    # Compare old and new set_metadata()
    db = self.init_old(self.cloned_library)
    mi = db.get_metadata(1, index_is_id=True, get_cover=True, cover_as_data=True)
    mi2 = db.get_metadata(3, index_is_id=True, get_cover=True, cover_as_data=True)
    db.set_metadata(2, mi)
    db.set_metadata(1, mi2, force_changes=True)
    oldmi = db.get_metadata(2, index_is_id=True, get_cover=True, cover_as_data=True)
    oldmi2 = db.get_metadata(1, index_is_id=True, get_cover=True, cover_as_data=True)
    db.close()
    del db
    cache = self.init_cache(self.cloned_library)
    cache.set_metadata(2, mi)
    nmi = cache.get_metadata(2, get_cover=True, cover_as_data=True)
    ae(oldmi.cover_data, nmi.cover_data)
    self.compare_metadata(nmi, oldmi,
                          exclude={'last_modified', 'format_metadata', 'formats'})
    cache.set_metadata(1, mi2, force_changes=True)
    nmi2 = cache.get_metadata(1, get_cover=True, cover_as_data=True)
    self.compare_metadata(nmi2, oldmi2,
                          exclude={'last_modified', 'format_metadata', 'formats'})

    cache = self.init_cache(self.cloned_library)
    mi = cache.get_metadata(1)
    otags = mi.tags
    mi.tags = [x.upper() for x in mi.tags]
    cache.set_metadata(3, mi)
    self.assertEqual(set(otags), set(cache.field_for('tags', 3)),
                     'case changes should not be allowed in set_metadata')

    # test that setting authors without author sort results in an
    # auto-generated authors sort
    mi = Metadata('empty', ['a1', 'a2'])
    cache.set_metadata(1, mi)
    self.assertEqual('a1 & a2', cache.field_for('author_sort', 1))
    cache.set_sort_for_authors({cache.get_item_id('authors', 'a1'): 'xy'})
    self.assertEqual('xy & a2', cache.field_for('author_sort', 1))
    mi = Metadata('empty', ['a1'])
    cache.set_metadata(1, mi)
    self.assertEqual('xy', cache.field_for('author_sort', 1))
def test_input_title(self):
    stream_meta = get_metadata(self.get_stream('title'))
    canon_meta = Metadata('A Title Tag & Title Ⓒ', [_('Unknown')])
    self.compare_metadata(stream_meta, canon_meta)
def parse_details(self, raw, root):
    asin = parse_asin(root, self.log, self.url)
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    mi = Metadata(title, authors)
    idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root, raw)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except:
            self.log.exception('Failed to parse new-style book details section')
    else:
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]

            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)

            try:
                mi.publisher = self.parse_publisher(pd)
            except:
                self.log.exception('Error parsing publisher for url: %r' % self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd)
            except:
                self.log.exception('Error parsing publish date for url: %r' % self.url)

            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except:
                self.log.exception('Error parsing language for url: %r' % self.url)
        else:
            self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields: escape the plain-text meta value into HTML
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field,
                    val.replace('&', '&amp;').replace('<', '&lt;')
                       .replace('>', '&gt;').replace('"', '&quot;')
                       .replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()

        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
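
# Illustration of the series convention parsed above: a trailing "[n]" in the
# series value is split off as the series index. This re-creates the regex
# standalone for demonstration; the series name is a made-up example.
import re

pat = re.compile(r'\[([.0-9]+)\]$')
series = 'A Series Name [1.5]'
m = pat.search(series)
assert m is not None
assert float(m.group(1)) == 1.5
assert series.replace(m.group(), '').strip() == 'A Series Name'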
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    self.load_config()

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # ignore unknown authors
    if authors == "V. A." or authors == "V.A." or authors == "Unknown" or authors == "Unbekannt":
        authors = None

    if (isbn is None) and (idn is None) and (title is None) and (authors is None):
        log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
        return None

    queries = []
    # DNB does not do an exact search when searching for an idn or isbn,
    # so we have to filter the results
    exact_search = {}
    if idn is not None:
        queries.append('num=' + idn)
        exact_search['idn'] = idn
    else:
        authors_v = []
        title_v = []
        if authors is not None:
            authors_v.append(' '.join(authors))
            authors_v.append(' '.join(
                self.get_author_tokens(authors, only_first_author=False)))
            authors_v.append(' '.join(
                self.get_author_tokens(authors, only_first_author=True)))
        if title is not None:
            title_v.append(title)
            title_v.append(' '.join(
                self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False)))
            title_v.append(' '.join(
                self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))
        if isbn is not None:
            exact_search['isbn'] = isbn

        # title and author
        if authors is not None and title is not None:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')
            # try with author and title swapped
            if isbn is not None:
                queries.append('per="' + title + '" AND tit="' + authors[0] + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '" AND tit="' + authors[0] + '"')

        # author but no title
        elif authors is not None and title is None:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')
            # try with author and title swapped
            if isbn is not None:
                queries.append('tit="' + authors[0] + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + authors[0] + '"')

        # title but no author
        elif authors is None and title is not None:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')
            # try with author and title swapped
            if isbn is not None:
                queries.append('per="' + title + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '"')

        # as last resort only use isbn
        if isbn is not None:
            queries.append('num=' + isbn)

        # Sort queries descending by length (assumption: longer query -> fewer but better results)
        #queries.sort(key=len)
        #queries.reverse()

    # remove duplicate queries
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None

    for query in uniqueQueries:
        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
        log.info(query)

        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)

        if results is None:
            continue

        log.info("Parsing records")

        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []

            # Title: Field 245
            title_parts = []
            # if a, n, p exist: series = a, series_index = n, title = p
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..", namespaces=ns):
                series_index = i.xpath(".//marc21:subfield[@code='n']", namespaces=ns)[0].text.strip()
                match = re.search("(\d+[,\.\d+]?)", series_index)
                if match:
                    series_index = match.group(1)
                else:
                    # looks like sometimes DNB does not know the series index and uses something like "[...]"
                    series_index = "0"
                series_index = series_index.replace(',', '.')
                series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                title_parts.append(i.xpath(".//marc21:subfield[@code='p']", namespaces=ns)[0].text.strip())
                log.info("Extracted Series: %s" % series)
                log.info("Extracted Series Index: %s" % series_index)
                break

            # otherwise: title = a
            if len(title_parts) == 0:
                for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    title_parts.append(i.text.strip())
                    break

            # subtitle 1
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns):
                title_parts.append(i.text.strip())
                break

            # subtitle 2
            #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
            #    title = title + " / " + i.text.strip()
            #    break

            title = " : ".join(title_parts)
            log.info("Extracted Title: %s" % title)

            # Title_Sort
            title_sort_parts = list(title_parts)
            title_sort_regex = re.match(
                '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0])
            sortword = title_sort_regex.group(2)
            if sortword:
                title_sort_parts[0] = ''.join(filter(None, [
                    title_sort_regex.group(1).strip(),
                    title_sort_regex.group(3).strip(),
                    ", " + sortword]))
            title_sort = " : ".join(title_sort_parts)
            log.info("Extracted Title_Sort: %s" % title_sort)

            # Authors
            authors = []
            author_sort = None
            # primary authors
            for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            # secondary authors
            for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            # if no "real" author was found, take all persons involved
            if len(authors) == 0:
                for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]
            log.info("Extracted Authors: %s" % " & ".join(authors))

            # Comments
            for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        comments = sanitize_comments_html(comments)
                        log.info('Comments: %s' % comments)
                        break
                    except:
                        log.info("Could not download Comments from %s" % i)

            # Publisher Name and Location
            publisher_name = None
            publisher_location = None
            fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            log.info("Extracted Publisher: %s" % publisher_name)
            log.info("Extracted Publisher Location: %s" % publisher_location)

            # Publishing Date
            for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns):
                match = re.search("(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 2)
                    break
            log.info("Extracted Publication Year: %s" % pubdate)

            # ID: IDN
            for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                idn = i.text.strip()
                break
            log.info("Extracted ID IDN: %s" % idn)
            if "idn" in exact_search:
                if idn != exact_search["idn"]:
                    log.info("Extracted IDN does not match book's IDN, skipping record")
                    continue

            # ID: URN
            for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                urn = i.text.strip()
                break
            log.info("Extracted ID URN: %s" % urn)

            # ID: ISBN
            for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                isbn = match.group()
                isbn = isbn.replace('-', '')
                break
            log.info("Extracted ID ISBN: %s" % isbn)
            if "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info("Extracted ISBN does not match book's ISBN, skipping record")
                    continue

            # ID: Sachgruppe (DDC)
            for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                ddc.append(i.text.strip())
            log.info("Extracted ID DDC: %s" % ",".join(ddc))

            # Series and Series_Index
            if series is None and series_index is None:
                for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    log.info("Extracted Series Index: %s" % series_index)
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                    log.info("Extracted Series: %s" % series)
                    break

            # Try to extract Series, Series Index and Title from the fetched title.
            # Caution: This overwrites DNB's series/series_index and modifies the title!
            if self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                parts = re.split("[:]", self.removeSortingCharacters(title))
                if len(parts) == 2:
                    if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                        # figure out which part contains the index
                        if bool(re.search("\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]

                        # remove odd characters from start and end of the text part
                        match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)
                        if match:
                            textpart = match.group(1)

                        # from title parts like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                title = textpart + " : Band " + guessed_series_index
                            else:
                                title = textpart
                        else:
                            # from title parts like: "Episode 2 Name of the series"
                            match = re.match("^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$", indexpart)
                            if match:
                                guessed_series_index = match.group(1)
                                guessed_series = match.group(2)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                elif len(parts) == 1:
                    # from titles like: "Name of the series - Title (Episode 2)"
                    match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                    if match:
                        guessed_series_index = match.group(3)
                        guessed_series = match.group(1)
                        title = match.group(2)
                    else:
                        # from titles like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            title = guessed_series + " : Band " + guessed_series_index

                if guessed_series is not None and guessed_series_index is not None:
                    series = guessed_series
                    series_index = guessed_series_index
                    log.info("Guessed Series: %s" % series)
                    log.info("Guessed Series Index: %s" % series_index)

            # GND Subjects from 689
            for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                subjects_gnd.append(i.text.strip())
            # GND Subjects from 600-655
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            # Non-GND subjects from 600-655
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects:
            for i in subjects_non_gnd:
                if len(i) < 2:
                    subjects_non_gnd.remove(i)
            log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

            # Edition
            for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                edition = i.text.strip()
                break
            log.info("Extracted Edition: %s" % edition)

            # Languages
            for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                languages.append(i.text.strip())
            if languages is not None:
                log.info("Extracted Languages: %s" % ",".join(languages))

            # Put it all together
            if self.cfg_append_edition_to_title == True and edition is not None:
                title = title + " : " + edition

            mi = Metadata(self.removeSortingCharacters(title),
                          map(lambda i: self.removeSortingCharacters(i), authors))
            mi.title_sort = self.removeSortingCharacters(title_sort)
            mi.author_sort = self.removeSortingCharacters(author_sort)
            mi.languages = languages
            mi.pubdate = pubdate
            mi.publisher = " : ".join(filter(None, [
                publisher_location, self.removeSortingCharacters(publisher_name)]))
            mi.series = self.removeSortingCharacters(series)
            mi.series_index = series_index
            mi.comments = comments
            mi.isbn = isbn  # also required for cover download
            mi.set_identifier('urn', urn)
            mi.set_identifier('dnb-idn', idn)
            mi.set_identifier('ddc', ",".join(ddc))

            if self.cfg_fetch_subjects == 0:
                mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 1:
                if len(subjects_gnd) > 0:
                    mi.tags = self.uniq(subjects_gnd)
                else:
                    mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 2:
                mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
            elif self.cfg_fetch_subjects == 3:
                if len(subjects_non_gnd) > 0:
                    mi.tags = self.uniq(subjects_non_gnd)
                else:
                    mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 4:
                mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 5:
                mi.tags = []

            # put current result's metadata into result queue
            log.info("Final formatted result: %s" % mi)
            result_queue.put(mi)
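
# Shape of the queries built by the DNB identify() above, shown with made-up
# values: title ('tit'), author ('per') and number ('num') clauses are ANDed,
# and every query gets the same material filter appended.
title, author, isbn = 'Der Prozess', 'Franz Kafka', '9783123456789'
query = 'tit="' + title + '" AND per="' + author + '" AND num="' + isbn + '"'
query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
print(query)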
def set_mi(self, mi, fm):
    '''
    This sets the metadata for the test result books table. It doesn't reset
    the contents of the field selectors for editing rules.
    '''
    self.fm = fm
    if mi:
        if not isinstance(mi, list):
            mi = (mi,)
    else:
        mi = Metadata(_('Title'), [_('Author')])
        mi.author_sort = _('Author Sort')
        mi.series = ngettext('Series', 'Series', 1)
        mi.series_index = 3
        mi.rating = 4.0
        mi.tags = [_('Tag 1'), _('Tag 2')]
        mi.languages = ['eng']
        mi.id = 1
        if self.fm is not None:
            mi.set_all_user_metadata(self.fm.custom_field_metadata())
        else:
            # No field metadata. Grab a copy from the current library so
            # that we can validate any custom column names. The values for
            # the columns will all be empty, which in some very unusual
            # cases might cause formatter errors. We can live with that.
            from calibre.gui2.ui import get_gui
            fm = get_gui().current_db.new_api.field_metadata
            mi.set_all_user_metadata(fm.custom_field_metadata())
        for col in mi.get_all_user_metadata(False):
            if fm[col]['datatype'] == 'datetime':
                mi.set(col, DEFAULT_DATE)
            elif fm[col]['datatype'] in ('int', 'float', 'rating'):
                mi.set(col, 2)
            elif fm[col]['datatype'] == 'bool':
                mi.set(col, False)
            elif fm[col]['is_multiple']:
                mi.set(col, (col,))
            else:
                mi.set(col, col, 1)
        mi = (mi,)
    self.mi = mi

    tv = self.template_value
    tv.setColumnCount(2)
    tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
    tv.horizontalHeader().setStretchLastSection(True)
    tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
    tv.setRowCount(len(mi))
    # Set the height of the table
    h = tv.rowHeight(0) * min(len(mi), 5)
    h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
    tv.setMinimumHeight(h)
    tv.setMaximumHeight(h)
    # Set the size of the title column
    if self.table_column_widths:
        tv.setColumnWidth(0, self.table_column_widths[0])
    else:
        tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
    tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
    tv.setRowCount(len(mi))
    # Use our own widget to get rid of elision. setTextElideMode() doesn't work
    for r in range(0, len(mi)):
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 0, w)
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 1, w)
    self.display_values('')
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree

    # total_results = XPath('//openSearch:totalResults')
    # start_index = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    url = XPath('descendant::atom:link[@rel="self"]/@href')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')

    # print(etree.tostring(entry_, pretty_print=True))

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    details_url = url(entry_)[0]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, details_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = type('')(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath('//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
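
# The ISBN selection above keeps all candidates but records the longest one as
# the primary ISBN, which in practice prefers an ISBN-13 over an ISBN-10 when
# both are present. Example values only:
isbns = ['0316769487', '9780316769488']
assert sorted(isbns, key=len)[-1] == '9780316769488'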
def __init__(self, parent, text, mi=None, fm=None, color_field=None,
             icon_field_key=None, icon_rule_kind=None, doing_emblem=False,
             text_is_placeholder=False, dialog_is_st_editor=False,
             global_vars=None, all_functions=None, builtin_functions=None):
    QDialog.__init__(self, parent)
    Ui_TemplateDialog.__init__(self)
    self.setupUi(self)

    self.coloring = color_field is not None
    self.iconing = icon_field_key is not None
    self.embleming = doing_emblem
    self.dialog_is_st_editor = dialog_is_st_editor
    if global_vars is None:
        self.global_vars = {}
    else:
        self.global_vars = global_vars

    cols = []
    if fm is not None:
        for key in sorted(displayable_columns(fm),
                          key=lambda k: sort_key(fm[k]['name'] if k != color_row_key else 0)):
            if key == color_row_key and not self.coloring:
                continue
            from calibre.gui2.preferences.coloring import all_columns_string
            name = all_columns_string if key == color_row_key else fm[key]['name']
            if name:
                cols.append((name, key))

    self.color_layout.setVisible(False)
    self.icon_layout.setVisible(False)

    if self.coloring:
        self.color_layout.setVisible(True)
        for n1, k1 in cols:
            self.colored_field.addItem(
                n1 + (' (' + k1 + ')' if k1 != color_row_key else ''), k1)
        self.colored_field.setCurrentIndex(self.colored_field.findData(color_field))
    elif self.iconing or self.embleming:
        self.icon_layout.setVisible(True)
        if self.embleming:
            self.icon_kind_label.setVisible(False)
            self.icon_kind.setVisible(False)
            self.icon_chooser_label.setVisible(False)
            self.icon_field.setVisible(False)

        for n1, k1 in cols:
            self.icon_field.addItem('{} ({})'.format(n1, k1), k1)
        self.icon_file_names = []
        d = os.path.join(config_dir, 'cc_icons')
        if os.path.exists(d):
            for icon_file in os.listdir(d):
                icon_file = icu_lower(icon_file)
                if os.path.exists(os.path.join(d, icon_file)):
                    if icon_file.endswith('.png'):
                        self.icon_file_names.append(icon_file)
        self.icon_file_names.sort(key=sort_key)
        self.update_filename_box()

        if self.iconing:
            dex = 0
            from calibre.gui2.preferences.coloring import icon_rule_kinds
            for i, tup in enumerate(icon_rule_kinds):
                txt, val = tup
                self.icon_kind.addItem(txt, userData=(val))
                if val == icon_rule_kind:
                    dex = i
            self.icon_kind.setCurrentIndex(dex)
            self.icon_field.setCurrentIndex(self.icon_field.findData(icon_field_key))

    if dialog_is_st_editor:
        self.buttonBox.setVisible(False)
    else:
        self.new_doc_label.setVisible(False)
        self.new_doc.setVisible(False)
        self.template_name_label.setVisible(False)
        self.template_name.setVisible(False)

    if mi:
        if not isinstance(mi, list):
            mi = (mi,)
    else:
        mi = Metadata(_('Title'), [_('Author')])
        mi.author_sort = _('Author Sort')
        mi.series = ngettext('Series', 'Series', 1)
        mi.series_index = 3
        mi.rating = 4.0
        mi.tags = [_('Tag 1'), _('Tag 2')]
        mi.languages = ['eng']
        mi.id = 1
        if fm is not None:
            mi.set_all_user_metadata(fm.custom_field_metadata())
        else:
            # No field metadata. Grab a copy from the current library so
            # that we can validate any custom column names. The values for
            # the columns will all be empty, which in some very unusual
            # cases might cause formatter errors. We can live with that.
            from calibre.gui2.ui import get_gui
            mi.set_all_user_metadata(
                get_gui().current_db.new_api.field_metadata.custom_field_metadata())
        for col in mi.get_all_user_metadata(False):
            mi.set(col, (col,), 0)
        mi = (mi,)
    self.mi = mi

    # Set up the display table
    self.table_column_widths = None
    try:
        self.table_column_widths = gprefs.get('template_editor_table_widths', None)
    except:
        pass
    tv = self.template_value
    tv.setRowCount(len(mi))
    tv.setColumnCount(2)
    tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
    tv.horizontalHeader().setStretchLastSection(True)
    tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
    # Set the height of the table
    h = tv.rowHeight(0) * min(len(mi), 5)
    h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
    tv.setMinimumHeight(h)
    tv.setMaximumHeight(h)
    # Set the size of the title column
    if self.table_column_widths:
        tv.setColumnWidth(0, self.table_column_widths[0])
    else:
        tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
    # Use our own widget to get rid of elision. setTextElideMode() doesn't work
    for r in range(0, len(mi)):
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 0, w)
        w = QLineEdit(tv)
        w.setReadOnly(True)
        tv.setCellWidget(r, 1, w)
    tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)

    # Remove help icon on title bar
    icon = self.windowIcon()
    self.setWindowFlags(self.windowFlags() & (~Qt.WindowType.WindowContextHelpButtonHint))
    self.setWindowIcon(icon)

    self.all_functions = all_functions if all_functions else formatter_functions().get_functions()
    self.builtins = (builtin_functions if builtin_functions else
                     formatter_functions().get_builtins_and_aliases())

    self.last_text = ''
    self.highlighter = TemplateHighlighter(self.textbox.document(),
                                           builtin_functions=self.builtins)
    self.textbox.cursorPositionChanged.connect(self.text_cursor_changed)
    self.textbox.textChanged.connect(self.textbox_changed)
    self.textbox.setFont(self.get_current_font())

    self.textbox.setTabStopWidth(10)
    self.source_code.setTabStopWidth(10)
    self.documentation.setReadOnly(True)
    self.source_code.setReadOnly(True)

    if text is not None:
        if text_is_placeholder:
            self.textbox.setPlaceholderText(text)
            self.textbox.clear()
            text = ''
        else:
            self.textbox.setPlainText(text)
    else:
        text = ''
    self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText(_('&OK'))
    self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText(_('&Cancel'))

    self.color_copy_button.clicked.connect(self.color_to_clipboard)
    self.filename_button.clicked.connect(self.filename_button_clicked)
    self.icon_copy_button.clicked.connect(self.icon_to_clipboard)

    try:
        with open(P('template-functions.json'), 'rb') as f:
            self.builtin_source_dict = json.load(f, encoding='utf-8')
    except:
        self.builtin_source_dict = {}

    func_names = sorted(self.all_functions)
    self.function.clear()
    self.function.addItem('')
    for f in func_names:
        self.function.addItem(
            '{} -- {}'.format(f, self.function_type_string(f, longform=False)), f)
    self.function.setCurrentIndex(0)
    self.function.currentIndexChanged.connect(self.function_changed)
    self.display_values(text)
    self.rule = (None, '')

    tt = _('Template language tutorial')
    self.template_tutorial.setText(
        '<a href="%s">%s</a>' % (
            localize_user_manual_link('https://manual.calibre-ebook.com/template_lang.html'), tt))
    tt = _('Template function reference')
    self.template_func_reference.setText(
        '<a href="%s">%s</a>' % (
            localize_user_manual_link('https://manual.calibre-ebook.com/generated/en/template_ref.html'), tt))

    s = gprefs.get('template_editor_break_on_print', False)
    self.go_button.setEnabled(s)
    self.remove_all_button.setEnabled(s)
    self.set_all_button.setEnabled(s)
    self.toggle_button.setEnabled(s)
    self.breakpoint_line_box.setEnabled(s)
    self.breakpoint_line_box_label.setEnabled(s)
    self.break_box.setChecked(s)
    self.break_box.stateChanged.connect(self.break_box_changed)
    self.go_button.clicked.connect(self.go_button_pressed)
    self.textbox.setFocus()
    self.set_up_font_boxes()
    self.toggle_button.clicked.connect(self.toggle_button_pressed)
    self.remove_all_button.clicked.connect(self.remove_all_button_pressed)
    self.set_all_button.clicked.connect(self.set_all_button_pressed)

    self.load_button.clicked.connect(self.load_template)
    self.save_button.clicked.connect(self.save_template)

    self.textbox.setWordWrapMode(QTextOption.WordWrap)
    self.textbox.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
    self.textbox.customContextMenuRequested.connect(self.show_context_menu)

    # Now geometry
    try:
        geom = gprefs.get('template_editor_dialog_geometry', None)
        if geom is not None:
            QApplication.instance().safe_restore_geometry(self, QByteArray(geom))
    except Exception:
        pass
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # Note: this previously tested `not title`, which checked the XPath
    # helper object (always truthy) instead of the extracted title string
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
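A minimal, self-contained sketch of the namespaced-XPath pattern used above: partially applying etree.XPath with a namespaces map yields reusable, precompiled selectors. The namespace map and sample XML here are illustrative stand-ins, not the plugin's real NAMESPACES.

from functools import partial
from lxml import etree

# Hypothetical namespace map for illustration only
NAMESPACES = {'atom': 'http://www.w3.org/2005/Atom'}
XPath = partial(etree.XPath, namespaces=NAMESPACES)

entry_id = XPath('descendant::atom:id')
doc = etree.fromstring(
    b'<entry xmlns="http://www.w3.org/2005/Atom">'
    b'<id>https://example.com/book/12345</id></entry>')
# The compiled selector returns a list of matching elements
print(entry_id(doc)[0].text.split('/')[-1])  # -> '12345'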
def __init__(self, parent, text, mi=None, fm=None, color_field=None,
             icon_field_key=None, icon_rule_kind=None, doing_emblem=False,
             text_is_placeholder=False, dialog_is_st_editor=False,
             global_vars={}, all_functions=None, builtin_functions=None):
    QDialog.__init__(self, parent)
    Ui_TemplateDialog.__init__(self)
    self.setupUi(self)

    self.coloring = color_field is not None
    self.iconing = icon_field_key is not None
    self.embleming = doing_emblem
    self.dialog_is_st_editor = dialog_is_st_editor
    self.global_vars = global_vars

    cols = []
    if fm is not None:
        for key in sorted(displayable_columns(fm),
                          key=lambda k: sort_key(fm[k]['name'] if k != color_row_key else 0)):
            if key == color_row_key and not self.coloring:
                continue
            from calibre.gui2.preferences.coloring import all_columns_string
            name = all_columns_string if key == color_row_key else fm[key]['name']
            if name:
                cols.append((name, key))

    self.color_layout.setVisible(False)
    self.icon_layout.setVisible(False)

    if self.coloring:
        self.color_layout.setVisible(True)
        for n1, k1 in cols:
            self.colored_field.addItem(
                n1 + (' (' + k1 + ')' if k1 != color_row_key else ''), k1)
        self.colored_field.setCurrentIndex(self.colored_field.findData(color_field))
    elif self.iconing or self.embleming:
        self.icon_layout.setVisible(True)
        if self.embleming:
            self.icon_kind_label.setVisible(False)
            self.icon_kind.setVisible(False)
            self.icon_chooser_label.setVisible(False)
            self.icon_field.setVisible(False)

        for n1, k1 in cols:
            self.icon_field.addItem('{} ({})'.format(n1, k1), k1)
        self.icon_file_names = []
        d = os.path.join(config_dir, 'cc_icons')
        if os.path.exists(d):
            for icon_file in os.listdir(d):
                icon_file = icu_lower(icon_file)
                if os.path.exists(os.path.join(d, icon_file)):
                    if icon_file.endswith('.png'):
                        self.icon_file_names.append(icon_file)
        self.icon_file_names.sort(key=sort_key)
        self.update_filename_box()

        if self.iconing:
            dex = 0
            from calibre.gui2.preferences.coloring import icon_rule_kinds
            for i, tup in enumerate(icon_rule_kinds):
                txt, val = tup
                self.icon_kind.addItem(txt, userData=(val))
                if val == icon_rule_kind:
                    dex = i
            self.icon_kind.setCurrentIndex(dex)
            self.icon_field.setCurrentIndex(self.icon_field.findData(icon_field_key))

    if dialog_is_st_editor:
        self.buttonBox.setVisible(False)
    else:
        self.new_doc_label.setVisible(False)
        self.new_doc.setVisible(False)
        self.template_name_label.setVisible(False)
        self.template_name.setVisible(False)

    if mi:
        self.mi = mi
    else:
        self.mi = Metadata(_('Title'), [_('Author')])
        self.mi.author_sort = _('Author Sort')
        self.mi.series = ngettext('Series', 'Series', 1)
        self.mi.series_index = 3
        self.mi.rating = 4.0
        self.mi.tags = [_('Tag 1'), _('Tag 2')]
        self.mi.languages = ['eng']
        self.mi.id = 1
        if fm is not None:
            self.mi.set_all_user_metadata(fm.custom_field_metadata())
        else:
            # No field metadata. Grab a copy from the current library so
            # that we can validate any custom column names. The values for
            # the columns will all be empty, which in some very unusual
            # cases might cause formatter errors. We can live with that.
            from calibre.gui2.ui import get_gui
            self.mi.set_all_user_metadata(
                get_gui().current_db.new_api.field_metadata.custom_field_metadata())
        for col in self.mi.get_all_user_metadata(False):
            self.mi.set(col, (col,), 0)

    # Remove help icon on title bar
    icon = self.windowIcon()
    self.setWindowFlags(self.windowFlags() & (~Qt.WindowType.WindowContextHelpButtonHint))
    self.setWindowIcon(icon)

    self.all_functions = all_functions if all_functions else formatter_functions().get_functions()
    self.builtins = builtin_functions if builtin_functions else formatter_functions().get_builtins()

    self.last_text = ''
    self.highlighter = TemplateHighlighter(self.textbox.document(), builtin_functions=self.builtins)
    self.textbox.cursorPositionChanged.connect(self.text_cursor_changed)
    self.textbox.textChanged.connect(self.textbox_changed)

    self.textbox.setTabStopWidth(10)
    self.source_code.setTabStopWidth(10)
    self.documentation.setReadOnly(True)
    self.source_code.setReadOnly(True)

    if text is not None:
        if text_is_placeholder:
            self.textbox.setPlaceholderText(text)
            self.textbox.clear()
        else:
            self.textbox.setPlainText(text)

    self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText(_('&OK'))
    self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText(_('&Cancel'))

    self.color_copy_button.clicked.connect(self.color_to_clipboard)
    self.filename_button.clicked.connect(self.filename_button_clicked)
    self.icon_copy_button.clicked.connect(self.icon_to_clipboard)

    try:
        with open(P('template-functions.json'), 'rb') as f:
            self.builtin_source_dict = json.load(f, encoding='utf-8')
    except:
        self.builtin_source_dict = {}

    func_names = sorted(self.all_functions)
    self.function.clear()
    self.function.addItem('')
    for f in func_names:
        self.function.addItem('{} -- {}'.format(f, self.function_type_string(f, longform=False)), f)
    self.function.setCurrentIndex(0)
    self.function.currentIndexChanged.connect(self.function_changed)
    self.textbox_changed()
    self.rule = (None, '')

    tt = _('Template language tutorial')
    self.template_tutorial.setText(
        '<a href="%s">%s</a>' % (
            localize_user_manual_link('https://manual.calibre-ebook.com/template_lang.html'), tt))
    tt = _('Template function reference')
    self.template_func_reference.setText(
        '<a href="%s">%s</a>' % (
            localize_user_manual_link('https://manual.calibre-ebook.com/generated/en/template_ref.html'), tt))

    self.font_size_box.setValue(gprefs['gpm_template_editor_font_size'])
    self.font_size_box.valueChanged.connect(self.font_size_changed)
def test_legacy_adding_books(self):  # {{{
    'Test various adding/deleting books methods'
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.ptempfile import TemporaryFile
    legacy, old = self.init_legacy(self.cloned_library), self.init_old(self.cloned_library)
    mi = Metadata('Added Book0', authors=('Added Author',))
    with TemporaryFile(suffix='.aff') as name:
        with open(name, 'wb') as f:
            f.write(b'xxx')
        T = partial(ET, 'add_books', ([name], ['AFF'], [mi]), old=old, legacy=legacy)
        T()(self)
        book_id = T(kwargs={'return_ids': True})(self)[1][0]
        self.assertEqual(legacy.new_api.formats(book_id), ('AFF',))
        T(kwargs={'add_duplicates': False})(self)
        mi.title = 'Added Book1'
        mi.uuid = 'uuu'
        T = partial(ET, 'import_book', (mi, [name]), old=old, legacy=legacy)
        book_id = T()(self)
        self.assertNotEqual(legacy.uuid(book_id, index_is_id=True),
                            old.uuid(book_id, index_is_id=True))
        book_id = T(kwargs={'preserve_uuid': True})(self)
        self.assertEqual(legacy.uuid(book_id, index_is_id=True),
                         old.uuid(book_id, index_is_id=True))
        self.assertEqual(legacy.new_api.formats(book_id), ('AFF',))

        T = partial(ET, 'add_format', old=old, legacy=legacy)
        T((0, 'AFF', BytesIO(b'fffff')))(self)
        T((0, 'AFF', BytesIO(b'fffff')))(self)
        T((0, 'AFF', BytesIO(b'fffff')), {'replace': True})(self)
    with TemporaryFile(suffix='.opf') as name:
        with open(name, 'wb') as f:
            f.write(b'zzzz')
        T = partial(ET, 'import_book', (mi, [name]), old=old, legacy=legacy)
        book_id = T()(self)
        self.assertFalse(legacy.new_api.formats(book_id))

    mi.title = 'Added Book2'
    T = partial(ET, 'create_book_entry', (mi,), old=old, legacy=legacy)
    T()
    T({'add_duplicates': False})
    T({'force_id': 1000})

    with TemporaryFile(suffix='.txt') as name:
        with open(name, 'wb') as f:
            f.write(b'tttttt')
        bid = legacy.add_catalog(name, 'My Catalog')
        self.assertEqual(old.add_catalog(name, 'My Catalog'), bid)
        cache = legacy.new_api
        self.assertEqual(cache.formats(bid), ('TXT',))
        self.assertEqual(cache.field_for('title', bid), 'My Catalog')
        self.assertEqual(cache.field_for('authors', bid), ('calibre',))
        self.assertEqual(cache.field_for('tags', bid), (_('Catalog'),))
        self.assertTrue(bid < legacy.add_catalog(name, 'Something else'))
        self.assertEqual(legacy.add_catalog(name, 'My Catalog'), bid)
        self.assertEqual(old.add_catalog(name, 'My Catalog'), bid)

        bid = legacy.add_news(name, {'title': 'Events', 'add_title_tag': True,
                                     'custom_tags': ('one', 'two')})
        self.assertEqual(cache.formats(bid), ('TXT',))
        self.assertEqual(cache.field_for('authors', bid), ('calibre',))
        self.assertEqual(cache.field_for('tags', bid), (_('News'), 'Events', 'one', 'two'))

    self.assertTrue(legacy.cover(1, index_is_id=True))
    origcov = legacy.cover(1, index_is_id=True)
    self.assertTrue(legacy.has_cover(1))
    legacy.remove_cover(1)
    self.assertFalse(legacy.has_cover(1))
    self.assertFalse(legacy.cover(1, index_is_id=True))
    legacy.set_cover(3, origcov)
    self.assertEqual(legacy.cover(3, index_is_id=True), origcov)
    self.assertTrue(legacy.has_cover(3))

    self.assertTrue(legacy.format(1, 'FMT1', index_is_id=True))
    legacy.remove_format(1, 'FMT1', index_is_id=True)
    self.assertIsNone(legacy.format(1, 'FMT1', index_is_id=True))

    legacy.delete_book(1)
    old.delete_book(1)
    self.assertNotIn(1, legacy.all_ids())
    legacy.dump_metadata((2, 3))
    old.close()
import json

from calibre.utils.logging import Log
from calibre.ebooks.metadata.book.base import Metadata
from calibre_plugins.crossref_doi_download import DoiMeta
from calibre_plugins.crossref_doi_download.doi_reader import DoiReader, get_title, get_author_list

reader = DoiReader(Log())
dm = DoiMeta('./plugin/')
br = dm.browser
fullurl = ('https://api.crossref.org/works?query.bibliographic='
           'Bayesian+data+analysis+Andrew+Gelman&mailto=vikoya5988%40oniaj.com')
cdata = br.open(fullurl).read()
data = json.loads(cdata)
message = data['message']
results = message['items']
identifiers = {}
# fin = map(lambda x: reader.result2meta(x, identifiers), results)
for r in results:
    reader.result2meta(r, identifiers)

result = results[1]
title = get_title(result)
authors = get_author_list(result)
mi = Metadata(title=title, authors=authors)

from calibre import ipython
ipython(locals())
def test_sorting(self):  # {{{
    'Test sorting'
    cache = self.init_cache()
    ae = self.assertEqual
    for field, order in {
        'title': [2, 1, 3],
        'authors': [2, 1, 3],
        'series': [3, 1, 2],
        'tags': [3, 1, 2],
        'rating': [3, 2, 1],
        # 'identifiers': [3, 2, 1], There is no stable sort since 1 and
        # 2 have the same identifier keys
        # 'last_modified': [3, 2, 1], There is no stable sort as two
        # records have the exact same value
        'timestamp': [2, 1, 3],
        'pubdate': [1, 2, 3],
        'publisher': [3, 2, 1],
        'languages': [3, 2, 1],
        'comments': [3, 2, 1],
        '#enum': [3, 2, 1],
        '#authors': [3, 2, 1],
        '#date': [3, 1, 2],
        '#rating': [3, 2, 1],
        '#series': [3, 2, 1],
        '#tags': [3, 2, 1],
        '#yesno': [2, 1, 3],
        '#comments': [3, 2, 1],
        'id': [1, 2, 3],
    }.iteritems():
        x = list(reversed(order))
        ae(order, cache.multisort([(field, True)], ids_to_sort=x),
           'Ascending sort of %s failed' % field)
        ae(x, cache.multisort([(field, False)], ids_to_sort=order),
           'Descending sort of %s failed' % field)

    # Test sorting of is_multiple fields.
    # Author like fields should be sorted by generating sort names from the
    # actual values in entry order
    for field in ('authors', '#authors'):
        ae(cache.set_field(field, {
            1: ('aa bb', 'bb cc', 'cc dd'),
            2: ('bb aa', 'xx yy'),
            3: ('aa bb', 'bb aa')}), {1, 2, 3})
        ae([2, 3, 1], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
        ae([1, 3, 2], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))

    # All other is_multiple fields should be sorted by sorting the values
    # for each book and using that as the sort key
    for field in ('tags', '#tags'):
        ae(cache.set_field(field, {1: ('b', 'a'), 2: ('c', 'y'), 3: ('b', 'z')}), {1, 2, 3})
        ae([1, 3, 2], cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
        ae([2, 3, 1], cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))

    # Test tweak to sort dates by visible format
    from calibre.utils.date import parse_only_date as p
    from calibre.utils.config_base import Tweak
    ae(cache.set_field('pubdate', {1: p('2001-3-3'), 2: p('2002-2-3'), 3: p('2003-1-3')}),
       {1, 2, 3})
    ae([1, 2, 3], cache.multisort([('pubdate', True)]))
    with Tweak('gui_pubdate_display_format', 'MMM'), \
            Tweak('sort_dates_using_visible_fields', True):
        c2 = self.init_cache()
        ae([3, 2, 1], c2.multisort([('pubdate', True)]))

    # Test bool sorting when not tristate
    cache.set_pref('bools_are_tristate', False)
    c2 = self.init_cache()
    ae([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)]))

    # Test subsorting
    ae([3, 2, 1], cache.multisort([('identifiers', True), ('title', True)]),
       'Subsort failed')

    from calibre.ebooks.metadata.book.base import Metadata
    for i in xrange(7):
        cache.create_book_entry(Metadata('title%d' % i), apply_import_tags=False)
    cache.create_custom_column('one', 'CC1', 'int', False)
    cache.create_custom_column('two', 'CC2', 'int', False)
    cache.create_custom_column('three', 'CC3', 'int', False)
    cache.close()
    cache = self.init_cache()
    cache.set_field('#one', {(i + (5 * m)): m for m in (0, 1) for i in xrange(1, 6)})
    cache.set_field('#two', {i + (m * 3): m for m in (0, 1, 2) for i in (1, 2, 3)})
    cache.set_field('#two', {10: 2})
    cache.set_field('#three', {i: i for i in xrange(1, 11)})
    ae(list(xrange(1, 11)),
       cache.multisort([('#one', True), ('#two', True)],
                       ids_to_sort=sorted(cache.all_book_ids())))
    ae([4, 5, 1, 2, 3, 7, 8, 9, 10, 6],
       cache.multisort([('#one', True), ('#two', False)],
                       ids_to_sort=sorted(cache.all_book_ids())))
    ae([5, 4, 3, 2, 1, 10, 9, 8, 7, 6],
       cache.multisort([('#one', True), ('#two', False), ('#three', False)],
                       ids_to_sort=sorted(cache.all_book_ids())))
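A small sketch of the subsort behaviour the test above exercises. Python's sort is stable, so sorting by the secondary key first and then by the primary key is equivalent to a single multisort over both keys; the book data here is made up for illustration.

# Illustrative data: id -> (primary key, secondary key)
books = {1: ('one', 'b'), 2: ('one', 'a'), 3: ('two', 'a')}
ids = sorted(books)
ids.sort(key=lambda i: books[i][1])   # secondary key first
ids.sort(key=lambda i: books[i][0])   # primary key last; stability keeps the subsort
print(ids)  # -> [2, 1, 3]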
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title.startswith(r'\376\377'):
        # Corrupted XMP packet generated by Nitro PDF. See
        # https://bugs.launchpad.net/calibre/+bug/1541981
        raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass
    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Was identifiers['doi'] = val, which mis-filed a valid ISBN
                # under the doi scheme; store under the matched scheme instead
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)
    return mi
def to_metadata(self, log, entry_, timeout):  # {{{
    from calibre.utils.date import parse_date, utcnow

    log.info("to_metadata")
    douban_id = entry_.get("id")
    title = entry_.get("title")
    description = entry_.get("summary")
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get("publisher")
    isbn = entry_.get("isbn13")  # ISBN10 is obsolete, use ISBN13
    pubdate = entry_.get("pubdate")
    authors = entry_.get("author")
    book_tags = entry_.get("tags")
    rating = entry_.get("rating")
    cover_url = entry_.get("cover")
    series = entry_.get("series")

    if not authors:
        authors = [_("Unknown")]  # was [("Unknown")]; restore the translation call used elsewhere
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {"douban": douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN
    isbns = []
    if isinstance(isbn, (str, bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for x in isbn:
            if check_isbn(x):
                isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    mi.tags = book_tags

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except BaseException:
            log.error("Failed to parse pubdate %r" % pubdate)

    # Rating
    if rating:
        try:
            # mi.publisher += "#PrB.rating#" + str(rating)
            mi.rating = rating / 2.0
        except BaseException:
            log.exception("Failed to parse rating")
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If URL contains "book-default", the book doesn't have a cover
        if u.find("book-default") == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series

    return mi
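A one-line worked example of the ISBN selection used in both to_metadata() variants above: when an ISBN-10 and an ISBN-13 both survive check_isbn(), the longest string is the ISBN-13, which is the preferred identifier. The sample values are illustrative.

# Equivalent ISBN-10 and ISBN-13 for the same book (illustrative)
isbns = ['0306406152', '9780306406157']
print(sorted(isbns, key=len)[-1])  # -> '9780306406157'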
def read_metadata_kfx(stream, read_cover=True):
    ' Read the metadata.kfx file that is found in the sdr book folder for KFX files '
    c = Container(stream.read())
    m = extract_metadata(c.decode())
    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ''
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get('title') or _('Unknown')
    authors = get('author', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(x):
        if tweaks['author_sort_copy_method'] != 'copy':
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + ' ' + m.group(1)
        return x

    unique_authors = []  # remove duplicates while retaining order
    for f in [fix_author(x) for x in authors]:
        if f not in unique_authors:
            unique_authors.append(f)

    mi = Metadata(title, unique_authors)
    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = list(filter(None, (canonicalize_lang(x) for x in get('languages', False))))
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and m[COVER_KEY]:
        try:
            data = from_base64_bytes(m[COVER_KEY])
            fmt, w, h = identify(data)
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)
    return mi
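A worked example of the "Last, First" swap that fix_author() above performs with auth_pat; the sample name is illustrative.

import re

auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')
m = auth_pat.match('Tolkien, J. R. R.')
if m is not None:
    # Group 1 is the part before the comma, group 2 the part after
    print(m.group(2) + ' ' + m.group(1))  # -> 'J. R. R. Tolkien'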
def mi(self):
    mi = Metadata(unicode_type(self.title.text()).strip() or _('Unknown'))
    mi.authors = string_to_authors(unicode_type(self.authors.text()).strip()) or [_('Unknown')]
    mi.languages = self.languages.lang_codes or [get_lang()]
    return mi
def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers):
    from lxml import etree

    def tostring(x):
        if x is None:
            return ''
        return etree.tostring(x, method='text', encoding=unicode).strip()

    orig_isbn = identifiers.get('isbn', None)
    title_tokens = list(self.get_title_tokens(orig_title))
    author_tokens = list(self.get_author_tokens(orig_authors))
    results = []

    def ismatch(title, authors):
        authors = lower(' '.join(authors))
        title = lower(title)
        match = not title_tokens
        for t in title_tokens:
            if lower(t) in title:
                match = True
                break
        amatch = not author_tokens
        for a in author_tokens:
            if lower(a) in authors:
                amatch = True
                break
        if not author_tokens:
            amatch = True
        return match and amatch

    bl = feed.find('BookList')
    if bl is None:
        err = tostring(feed.find('errormessage'))
        raise ValueError('ISBNDb query failed:' + err)
    total_results = int(bl.get('total_results'))
    shown_results = int(bl.get('shown_results'))
    for bd in bl.xpath('.//BookData'):
        isbn = check_isbn(bd.get('isbn', None))
        isbn13 = check_isbn(bd.get('isbn13', None))
        if not isbn and not isbn13:
            continue
        if orig_isbn and orig_isbn not in {isbn, isbn13}:
            continue
        title = tostring(bd.find('Title'))
        if not title:
            continue
        authors = []
        for au in bd.xpath('.//Authors/Person'):
            au = tostring(au)
            if au:
                if ',' in au:
                    ln, _, fn = au.partition(',')
                    au = fn.strip() + ' ' + ln.strip()
                authors.append(au)
        if not authors:
            continue
        comments = tostring(bd.find('Summary'))
        id_ = (title, tuple(authors))
        if id_ in seen:
            continue
        seen.add(id_)
        if not ismatch(title, authors):
            continue
        publisher = tostring(bd.find('PublisherText'))
        if not publisher:
            publisher = None
        if publisher and 'audio' in publisher.lower():
            continue
        mi = Metadata(title, authors)
        mi.isbn = isbn
        mi.publisher = publisher
        mi.comments = comments
        results.append(mi)
    return total_results, shown_results, results
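A condensed sketch of the token-overlap test in ismatch() above: a result matches when any title token occurs in the candidate title and any author token occurs in the candidate authors (an empty token list always matches). This version uses any() and plain str.lower() in place of the loop-and-break form and calibre's locale-aware lower(); the sample inputs are made up.

def ismatch(title, authors, title_tokens, author_tokens):
    title = title.lower()
    authors = ' '.join(authors).lower()
    match = not title_tokens or any(t.lower() in title for t in title_tokens)
    amatch = not author_tokens or any(a.lower() in authors for a in author_tokens)
    return match and amatch

print(ismatch('The Hobbit', ['J. R. R. Tolkien'], ['hobbit'], ['tolkien']))  # -> True
print(ismatch('The Hobbit', ['J. R. R. Tolkien'], ['dune'], ['tolkien']))    # -> False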
def merge(self, results, min_year, do_asr=True):
    ans = Metadata(_('Unknown'))

    # We assume the shortest title has the least cruft in it
    ans.title = self.length_merge('title', results, null_value=ans.title)

    # No harm in having extra authors, maybe something useful like an
    # editor or translator
    ans.authors = self.length_merge('authors', results, null_value=ans.authors, shortest=False)

    # We assume the shortest publisher has the least cruft in it
    ans.publisher = self.length_merge('publisher', results, null_value=ans.publisher)

    # We assume the smallest set of tags has the least cruft in it
    ans.tags = self.length_merge('tags', results, null_value=ans.tags,
                                 shortest=msprefs['fewer_tags'])

    # We assume the longest series has the most info in it
    ans.series = self.length_merge('series', results, null_value=ans.series, shortest=False)
    for r in results:
        if r.series and r.series == ans.series:
            ans.series_index = r.series_index
            break

    # Average the rating over all sources
    ratings = []
    for r in results:
        rating = r.rating
        if rating and rating > 0 and rating <= 5:
            ratings.append(rating)
    if ratings:
        ans.rating = int(round(sum(ratings) / len(ratings)))

    # Smallest language is likely to be valid
    ans.language = self.length_merge('language', results, null_value=ans.language)

    # Choose longest comments
    ans.comments = self.length_merge('comments', results, null_value=ans.comments, shortest=False)

    # Published date
    if min_year:
        for r in results:
            year = getattr(r.pubdate, 'year', None)
            if year == min_year:
                ans.pubdate = r.pubdate
                break
        if getattr(ans.pubdate, 'year', None) == min_year:
            min_date = datetime(min_year, ans.pubdate.month, ans.pubdate.day, tzinfo=utc_tz)
        else:
            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
        ans.pubdate = min_date
    else:
        min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
        for r in results:
            if r.pubdate is not None:
                candidate = as_utc(r.pubdate)
                if candidate < min_date:
                    min_date = candidate
        if min_date.year < 3000:
            ans.pubdate = min_date

    # Identifiers
    for r in results:
        ans.identifiers.update(r.identifiers)

    # Cover URL
    ans.has_cached_cover_url = bool([r for r in results
                                     if getattr(r, 'has_cached_cover_url', False)])

    # Merge any other fields with no special handling (random merge)
    touched_fields = set()
    for r in results:
        if hasattr(r, 'identify_plugin'):
            touched_fields |= r.identify_plugin.touched_fields
    for f in touched_fields:
        if f.startswith('identifier:') or not ans.is_null(f):
            continue
        setattr(ans, f, self.random_merge(f, results, null_value=getattr(ans, f)))

    if do_asr:
        avg = [x.relevance_in_source for x in results]
        avg = sum(avg) / len(avg)
        ans.average_source_relevance = avg

    return ans
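A minimal sketch of the length-based merge policy the comments above describe: pick the shortest non-null candidate when less cruft is wanted, the longest when more information is wanted. This is an illustration of the idea only, not the real self.length_merge() implementation, which works on Metadata fields rather than raw values.

def length_merge(values, null_value=None, shortest=True):
    # Keep only non-empty candidates, then pick by length
    candidates = [v for v in values if v]
    if not candidates:
        return null_value
    return min(candidates, key=len) if shortest else max(candidates, key=len)

print(length_merge(['A Title: With Extra Cruft', 'A Title']))            # -> 'A Title'
print(length_merge(['short note', 'a much longer comment'], shortest=False))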
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long

    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
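A worked example of the series pattern used above: a trailing "[n]" in the series string is split into the series name and a numeric series index. The sample series string is illustrative.

import re

pat = re.compile(r'\[([.0-9]+)\]$')
series = 'Foundation [3.0]'
m = pat.search(series)
if m is not None:
    # m.group() is the full '[3.0]' suffix; m.group(1) is the number inside
    print(series.replace(m.group(), '').strip(), float(m.group(1)))
# -> Foundation 3.0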
def start(self, title, authors, identifiers):
    book = Metadata(title, authors)
    book.identifiers = identifiers
    self.covers_widget.start(book, self.current_cover, title, authors, {})
    return self.exec_()
def validate(self, x):
    from calibre.ebooks.metadata.book.base import Metadata
    self.book = Metadata('')
    return self.vformat(x, [], {})