def parse_details(self, root):
        """Extract all book metadata from the parsed CBDB page tree *root*
        and put a populated ``Metadata`` object on ``self.result_queue``.

        Each field is parsed inside its own try/except so that a failure in
        one field does not prevent the remaining fields from being extracted;
        the bare excepts follow the surrounding plugin's logging style.
        Returns early (queueing nothing) when title, authors or the CBDB id
        cannot be determined.
        """
        try:
            CBDB_id = self.parse_CBDB_id(self.url)
        except:
            self.log.exception('Error parsing CBDB id for url: %r' % self.url)
            CBDB_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r' %
                               self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        # Title, authors and the CBDB id are mandatory: without them the
        # record is useless, so bail out before building a Metadata object.
        if not title or not authors or not CBDB_id:
            self.log.error('Could not find title/authors/CBDB id for %r' %
                           self.url)
            self.log.error('CBDB: %r Title: %r Authors: %r' %
                           (CBDB_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('cbdb', CBDB_id)
        self.CBDB_id = CBDB_id

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        # summary
        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_urls = self.parse_covers(root)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_urls)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing publisher and date for url: %r' %
                               self.url)

        mi.source_relevance = self.relevance

        # NOTE(review): calibre commonly stores languages as ISO codes
        # (e.g. 'ces'); confirm downstream code accepts the name 'Czech'.
        mi.language = 'Czech'

        # Prime the plugin caches so later ISBN/cover lookups for this book
        # do not require re-fetching the page.
        if self.CBDB_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)

            if self.cover_urls:
                self.plugin.cache_identifier_to_cover_url(
                    self.CBDB_id, self.cover_urls)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
# --- Beispiel #2 (score: 0) ---
 def reset_info(self):
     """Reset the display to a blank 'Unknown' metadata record."""
     blank = Metadata(_('Unknown'))
     self.show_data(blank)
# --- Beispiel #3 (score: 0) ---
def identify(
        log,
        abort,  # {{{
        title=None,
        authors=None,
        identifiers=None,
        timeout=30,
        allowed_plugins=None):
    """Run every configured 'identify' metadata plugin concurrently and
    return the merged, sorted list of results.

    Workers are polled until all finish or, once the first result arrives,
    until ``wait_after_first_identify_result`` seconds elapse (remaining
    workers are then aborted).  Per-plugin results are sorted, deduplicated
    on (title, authors), post-processed (ignored fields, relevance, comment
    conversion) and finally merged across sources.
    """
    # None sentinel instead of a mutable default dict shared across calls.
    if identifiers is None:
        identifiers = {}
    if title == _('Unknown'):
        title = None
    if authors == [_('Unknown')]:
        authors = None
    start_time = time.time()

    plugins = [
        p for p in metadata_plugins(['identify']) if p.is_configured() and (
            allowed_plugins is None or p.name in allowed_plugins)
    ]

    kwargs = {
        'title': title,
        'authors': authors,
        'identifiers': identifiers,
        'timeout': timeout,
    }

    log('Running identify query with parameters:')
    log(kwargs)
    log('Using plugins:',
        ', '.join(['%s %s' % (p.name, p.version) for p in plugins]))
    log('The log from individual plugins is below')

    workers = [Worker(p, kwargs, abort) for p in plugins]
    for w in workers:
        w.start()

    first_result_at = None
    results = {}
    for p in plugins:
        results[p] = []
    logs = dict([(w.plugin, w.buf) for w in workers])

    def get_results():
        # Drain every worker's queue without blocking; report whether any
        # new result arrived in this pass.
        found = False
        for w in workers:
            try:
                result = w.rq.get_nowait()
            except Empty:
                pass
            else:
                results[w.plugin].append(result)
                found = True
        return found

    wait_time = msprefs['wait_after_first_identify_result']
    while True:
        time.sleep(0.2)

        if get_results() and first_result_at is None:
            first_result_at = time.time()

        if not is_worker_alive(workers):
            break

        if (first_result_at is not None
                and time.time() - first_result_at > wait_time):
            log.warn('Not waiting any longer for more results. Still running'
                     ' sources:')
            for worker in workers:
                if worker.is_alive():
                    log.debug('\t' + worker.name)
            abort.set()
            break

    # Collect any results that were queued between the last poll and abort.
    while not abort.is_set() and get_results():
        pass

    sort_kwargs = dict(kwargs)
    # list(dict) works on both Python 2 and 3; dict.iterkeys() does not
    # exist on Python 3.
    for k in list(sort_kwargs):
        if k not in ('title', 'authors', 'identifiers'):
            sort_kwargs.pop(k)

    longest, lp = -1, ''
    # dict.items() is valid on both Python 2 and 3, unlike dict.iteritems().
    for plugin, presults in results.items():
        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))

        # Throw away lower priority results from the same source that have exactly the same
        # title and authors as a higher priority result
        filter_results = set()
        filtered_results = []
        for r in presults:
            key = (r.title, tuple(r.authors))
            if key not in filter_results:
                filtered_results.append(r)
                filter_results.add(key)
        results[plugin] = presults = filtered_results

        plog = logs[plugin].getvalue().strip()
        log('\n' + '*' * 30, plugin.name, '%s' % (plugin.version, ), '*' * 30)
        log('Found %d results' % len(presults))
        time_spent = getattr(plugin, 'dl_time_spent', None)
        if time_spent is None:
            log('Downloading was aborted')
            longest, lp = -1, plugin.name
        else:
            log('Downloading from', plugin.name, 'took', time_spent)
            if time_spent > longest:
                longest, lp = time_spent, plugin.name
        for r in presults:
            log('\n\n---')
            try:
                log(unicode(r))
            except TypeError:
                log(repr(r))
        if plog:
            log(plog)
        log('\n' + '*' * 80)

        # Blank out user-ignored fields by copying them from an empty record,
        # and record per-source relevance/cover-cache information.
        dummy = Metadata(_('Unknown'))
        for i, result in enumerate(presults):
            for f in plugin.prefs['ignore_fields']:
                if ':' not in f:
                    setattr(result, f, getattr(dummy, f))
                if f == 'series':
                    result.series_index = dummy.series_index
            result.relevance_in_source = i
            result.has_cached_cover_url = (plugin.cached_cover_url_is_reliable
                                           and plugin.get_cached_cover_url(
                                               result.identifiers) is not None)
            result.identify_plugin = plugin
            if msprefs['txt_comments']:
                if plugin.has_html_comments and result.comments:
                    result.comments = html2text(result.comments)

    log('The identify phase took %.2f seconds' % (time.time() - start_time))
    log('The longest time (%f) was taken by:' % longest, lp)
    log('Merging results from different sources')
    start_time = time.time()
    results = merge_identify_results(results, log)

    log('We have %d merged results, merging took: %.2f seconds' %
        (len(results), time.time() - start_time))
    tm_rules = msprefs['tag_map_rules']
    if tm_rules:
        from calibre.ebooks.metadata.tag_mapper import map_tags
    am_rules = msprefs['author_map_rules']
    if am_rules:
        from calibre.ebooks.metadata.author_mapper import map_authors, compile_rules
        am_rules = compile_rules(am_rules)

    max_tags = msprefs['max_tags']
    for r in results:
        if tm_rules:
            r.tags = map_tags(r.tags, tm_rules)
        r.tags = r.tags[:max_tags]
        if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year:
            r.pubdate = None

    if msprefs['swap_author_names']:

        def swap_to_ln_fn(a):
            # Already "Last, First" (or single token): leave untouched.
            if ',' in a:
                return a
            parts = a.split(None)
            if len(parts) <= 1:
                return a
            surname = parts[-1]
            return '%s, %s' % (surname, ' '.join(parts[:-1]))

        # Hoisted out of the per-result loop: the helper is pure and does
        # not need to be re-defined for every result.
        for r in results:
            r.authors = [swap_to_ln_fn(a) for a in r.authors]

    if am_rules:
        for r in results:
            new_authors = map_authors(r.authors, am_rules)
            if new_authors != r.authors:
                r.authors = new_authors
                r.author_sort = authors_to_sort_string(r.authors)

    return results
# --- Beispiel #4 (score: 0) ---
    def __init__(self,
                 prefix,
                 lpath,
                 title=None,
                 authors=None,
                 mime=None,
                 date=None,
                 ContentType=None,
                 thumbnail_name=None,
                 size=None,
                 other=None):
        """Build a Kobo device Book record from database/filesystem values.

        Falls through a chain of progressively looser date formats when
        parsing *date*, and initialises all Kobo-specific bookkeeping
        attributes (series, shelves, collections, subtitle).
        """
        from calibre.utils.date import parse_date
        #         debug_print('Book::__init__ - title=', title)
        # Debug hook: replace "xxxxx" with (part of) a lower-cased title to
        # get verbose tracing for just that book.
        show_debug = title is not None and title.lower().find("xxxxx") >= 0
        if other is not None:
            # Propagate title/date onto the pre-existing record before the
            # base class copies from it.
            other.title = title
            other.published_date = date
        if show_debug:
            debug_print("Book::__init__ - title=", title, 'authors=', authors)
            debug_print("Book::__init__ - other=", other)
        super(Book, self).__init__(prefix, lpath, size, other)

        if title is not None and len(title) > 0:
            self.title = title

        if authors is not None and len(authors) > 0:
            self.authors_from_string(authors)
            # Derive a proper author sort when none was produced.
            if self.author_sort is None or self.author_sort == "Unknown":
                self.author_sort = author_to_author_sort(authors)

        self.mime = mime

        self.size = size  # will be set later if None

        # NOTE(review): ContentType '6' appears to mark book content whose
        # date needs parsing — confirm against the Kobo driver.  The nested
        # bare excepts deliberately try ever-looser formats: full ISO with
        # microseconds, then without, then date-only, then a generic parse,
        # then the file's ctime, and finally "now".
        if ContentType == '6' and date is not None:
            try:
                self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
            except:
                try:
                    self.datetime = time.strptime(
                        date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                except:
                    try:
                        self.datetime = time.strptime(
                            date.split('+')[0], "%Y-%m-%d")
                    except:
                        try:
                            self.datetime = parse_date(
                                date, assume_utc=True).timetuple()
                        except:
                            try:
                                self.datetime = time.gmtime(
                                    os.path.getctime(self.path))
                            except:
                                self.datetime = time.gmtime()

        # Kobo-specific state, filled in later from the device database.
        self.kobo_metadata = Metadata(title, self.authors)
        self.contentID = None
        self.current_shelves = []
        self.kobo_collections = []
        self.can_put_on_shelves = True
        self.kobo_series = None
        self.kobo_series_number = None  # Kobo stores the series number as string. And it can have a leading "#".
        self.kobo_series_id = None
        self.kobo_subtitle = None

        if thumbnail_name is not None:
            self.thumbnail = ImageWrapper(thumbnail_name)

        if show_debug:
            debug_print("Book::__init__ end - self=", self)
            debug_print("Book::__init__ end - title=", title, 'authors=',
                        authors)
# --- Beispiel #5 (score: 0) ---
 def validate(self, x):
     """Validate template *x* by formatting it against an empty book record."""
     from calibre.ebooks.metadata.book.base import Metadata
     empty_book = Metadata('')
     return self.safe_format(x, {}, 'VALIDATE ERROR', empty_book)
# --- Beispiel #6 (score: 0) ---
    def test_legacy_direct(self):  # {{{
        'Test read-only methods that are directly equivalent in the old and new interface'
        from calibre.ebooks.metadata.book.base import Metadata
        from datetime import timedelta
        # ndb wraps the new API behind the legacy interface; db is the old
        # implementation.  Every call below must agree between the two.
        ndb = self.init_legacy(self.cloned_library)
        db = self.init_old()
        newstag = ndb.new_api.get_item_id('tags', 'news')

        self.assertEqual(dict(db.prefs), dict(ndb.prefs))

        # Method-name prefixes (interpreted after the dict literal):
        # '!' compares results coerced to dict, '@' coerced to frozenset;
        # unprefixed names are compared as returned.
        for meth, args in iteritems({
                'find_identical_books': [(Metadata('title one',
                                                   ['author one']), ),
                                         (Metadata('unknown'), ),
                                         (Metadata('xxxx'), )],
                'get_books_for_category': [('tags', newstag),
                                           ('#formats', 'FMT1')],
                'get_next_series_num_for': [('A Series One', )],
                'get_id_from_uuid': [('ddddd', ), (db.uuid(1, True), )],
                'cover': [(0, ), (1, ), (2, )],
                'get_author_id': [('author one', ), ('unknown', ),
                                  ('xxxxx', )],
                'series_id': [(0, ), (1, ), (2, )],
                'publisher_id': [(0, ), (1, ), (2, )],
                '@tags_older_than': [
                    ('News', None),
                    ('Tag One', None),
                    ('xxxx', None),
                    ('Tag One', None, 'News'),
                    ('News', None, 'xxxx'),
                    ('News', None, None, ['xxxxxxx']),
                    ('News', None, 'Tag One', ['Author Two', 'Author One']),
                    ('News', timedelta(0), None, None),
                    ('News', timedelta(100000)),
                ],
                'format': [(1, 'FMT1', True), (2, 'FMT1', True),
                           (0, 'xxxxxx')],
                'has_format': [(1, 'FMT1', True), (2, 'FMT1', True),
                               (0, 'xxxxxx')],
                'sizeof_format': [(1, 'FMT1', True), (2, 'FMT1', True),
                                  (0, 'xxxxxx')],
                '@format_files': [(0, ), (1, ), (2, )],
                'formats': [(0, ), (1, ), (2, )],
                'max_size': [(0, ), (1, ), (2, )],
                'format_hash': [(1, 'FMT1'), (1, 'FMT2'), (2, 'FMT1')],
                'author_sort_from_authors':
            [(['Author One', 'Author Two', 'Unknown'], )],
                'has_book': [(Metadata('title one'), ),
                             (Metadata('xxxx1111'), )],
                'has_id': [(1, ), (2, ), (3, ), (9999, )],
                'id': [
                    (1, ),
                    (2, ),
                    (0, ),
                ],
                'index': [
                    (1, ),
                    (2, ),
                    (3, ),
                ],
                'row': [
                    (1, ),
                    (2, ),
                    (3, ),
                ],
                'is_empty': [()],
                'count': [()],
                'all_author_names': [()],
                'all_tag_names': [()],
                'all_series_names': [()],
                'all_publisher_names': [()],
                '!all_authors': [()],
                '!all_tags2': [()],
                '@all_tags': [()],
                '@get_all_identifier_types': [()],
                '!all_publishers': [()],
                '!all_titles': [()],
                '!all_series': [()],
                'standard_field_keys': [()],
                'all_field_keys': [()],
                'searchable_fields': [()],
                'search_term_to_field_key': [('author', ), ('tag', )],
                'metadata_for_field': [('title', ), ('tags', )],
                'sortable_field_keys': [()],
                'custom_field_keys': [(True, ), (False, )],
                '!get_usage_count_by_id': [('authors', ), ('tags', ),
                                           ('series', ), ('publisher', ),
                                           ('#tags', ), ('languages', )],
                'get_field': [(1, 'title'), (2, 'tags'), (0, 'rating'),
                              (1, 'authors'), (2, 'series'), (1, '#tags')],
                'all_formats': [()],
                'get_authors_with_ids': [()],
                '!get_tags_with_ids': [()],
                '!get_series_with_ids': [()],
                '!get_publishers_with_ids': [()],
                '!get_ratings_with_ids': [()],
                '!get_languages_with_ids': [()],
                'tag_name': [(3, )],
                'author_name': [(3, )],
                'series_name': [(3, )],
                'authors_sort_strings': [(0, ), (1, ), (2, )],
                'author_sort_from_book': [(0, ), (1, ), (2, )],
                'authors_with_sort_strings': [(0, ), (1, ), (2, )],
                'book_on_device_string': [(1, ), (2, ), (3, )],
                'books_in_series_of': [(0, ), (1, ), (2, )],
                'books_with_same_title': [(Metadata(db.title(0)), ),
                                          (Metadata(db.title(1)), ),
                                          (Metadata('1234'), )],
        }):
            fmt = lambda x: x
            if meth[0] in {'!', '@'}:
                fmt = {'!': dict, '@': frozenset}[meth[0]]
                meth = meth[1:]
            elif meth == 'get_authors_with_ids':
                # Compare keyed on author id so row ordering is irrelevant.
                fmt = lambda val: {x[0]: tuple(x[1:]) for x in val}
            for a in args:
                self.assertEqual(
                    fmt(getattr(db, meth)(*a)), fmt(getattr(ndb, meth)(*a)),
                    'The method: %s() returned different results for argument %s'
                    % (meth, a))

        def f(
            x, y
        ):  # get_top_level_move_items is broken in the old db on case-insensitive file systems
            x.discard('metadata_db_prefs_backup.json')
            return x, y

        self.assertEqual(f(*db.get_top_level_move_items()),
                         f(*ndb.get_top_level_move_items()))
        d1, d2 = BytesIO(), BytesIO()
        db.copy_cover_to(1, d1, True)
        ndb.copy_cover_to(1, d2, True)
        self.assertTrue(d1.getvalue() == d2.getvalue())
        d1, d2 = BytesIO(), BytesIO()
        db.copy_format_to(1, 'FMT1', d1, True)
        ndb.copy_format_to(1, 'FMT1', d2, True)
        self.assertTrue(d1.getvalue() == d2.getvalue())
        old = db.get_data_as_dict(prefix='test-prefix')
        new = ndb.get_data_as_dict(prefix='test-prefix')
        # Normalise both sides before comparing: decode byte keys to text
        # and turn list values into sets (ordering may differ).
        for o, n in zip(old, new):
            o = {
                type('')(k) if isinstance(k, bytes) else k:
                set(v) if isinstance(v, list) else v
                for k, v in iteritems(o)
            }
            n = {
                k: set(v) if isinstance(v, list) else v
                for k, v in iteritems(n)
            }
            self.assertEqual(o, n)

        ndb.search('title:Unknown')
        db.search('title:Unknown')
        self.assertEqual(db.row(3), ndb.row(3))
        self.assertRaises(ValueError, ndb.row, 2)
        self.assertRaises(ValueError, db.row, 2)
        db.close()
# --- Beispiel #7 (score: 0) ---
    def test_legacy_setters(self):  # {{{
        'Test methods that are directly equivalent in the old and new interface'
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import now
        n = now()
        ndb = self.init_legacy(self.cloned_library)
        amap = ndb.new_api.get_id_map('authors')
        sorts = [(aid, 's%d' % aid) for aid in amap]
        db = self.init_old(self.cloned_library)
        # run_funcs presumably invokes each (method, *args) spec on both db
        # and ndb and compares the results; the '+', '@' and '#' name
        # prefixes look like result post-processing selectors — TODO confirm
        # against the run_funcs helper.
        run_funcs(self, db, ndb, (
            ('+format_metadata', 1, 'FMT1', itemgetter('size')),
            ('+format_metadata', 1, 'FMT2', itemgetter('size')),
            ('+format_metadata', 2, 'FMT1', itemgetter('size')),
            ('get_tags', 0),
            ('get_tags', 1),
            ('get_tags', 2),
            ('is_tag_used', 'News'),
            ('is_tag_used', 'xchkjgfh'),
            ('bulk_modify_tags', (1, ), ['t1'], ['News']),
            ('bulk_modify_tags', (2, ), ['t1'], ['Tag One', 'Tag Two']),
            ('bulk_modify_tags', (3, ), ['t1', 't2', 't3']),
            (db.clean, ),
            ('@all_tags', ),
            ('@tags', 0),
            ('@tags', 1),
            ('@tags', 2),
            ('unapply_tags', 1, ['t1']),
            ('unapply_tags', 2, ['xxxx']),
            ('unapply_tags', 3, ['t2', 't3']),
            (db.clean, ),
            ('@all_tags', ),
            ('@tags', 0),
            ('@tags', 1),
            ('@tags', 2),
            ('update_last_modified', (1, ), True, n),
            ('update_last_modified', (3, ), True, n),
            ('metadata_last_modified', 1, True),
            ('metadata_last_modified', 3, True),
            ('set_sort_field_for_author', sorts[0][0], sorts[0][1]),
            ('set_sort_field_for_author', sorts[1][0], sorts[1][1]),
            ('set_sort_field_for_author', sorts[2][0], sorts[2][1]),
            ('set_link_field_for_author', sorts[0][0], sorts[0][1]),
            ('set_link_field_for_author', sorts[1][0], sorts[1][1]),
            ('set_link_field_for_author', sorts[2][0], sorts[2][1]),
            (db.refresh, ),
            ('author_sort', 0),
            ('author_sort', 1),
            ('author_sort', 2),
        ))
        # Full metadata objects from both interfaces must agree on the
        # author sort/link maps after the mutations above.
        omi = [db.get_metadata(x) for x in (0, 1, 2)]
        nmi = [ndb.get_metadata(x) for x in (0, 1, 2)]
        self.assertEqual([x.author_sort_map for x in omi],
                         [x.author_sort_map for x in nmi])
        self.assertEqual([x.author_link_map for x in omi],
                         [x.author_link_map for x in nmi])
        db.close()

        # Fresh copies: exercise the individual field setters.
        ndb = self.init_legacy(self.cloned_library)
        db = self.init_old(self.cloned_library)

        run_funcs(self, db, ndb, (
            (
                'set_authors',
                1,
                ('author one', ),
            ),
            ('set_authors', 2, ('author two', ), True, True, True),
            ('set_author_sort', 3, 'new_aus'),
            ('set_comment', 1, ''),
            ('set_comment', 2, None),
            ('set_comment', 3, '<p>a comment</p>'),
            ('set_has_cover', 1, True),
            ('set_has_cover', 2, True),
            ('set_has_cover', 3, 1),
            ('set_identifiers', 2, {
                'test': '',
                'a': 'b'
            }),
            ('set_identifiers', 3, {
                'id': '1',
                'isbn': '9783161484100'
            }),
            ('set_identifiers', 1, {}),
            ('set_languages', 1, ('en', )),
            ('set_languages', 2, ()),
            ('set_languages', 3, ('deu', 'spa', 'fra')),
            ('set_pubdate', 1, None),
            ('set_pubdate', 2, '2011-1-7'),
            ('set_series', 1, 'a series one'),
            ('set_series', 2, 'another series [7]'),
            ('set_series', 3, 'a third series'),
            ('set_publisher', 1, 'publisher two'),
            ('set_publisher', 2, None),
            ('set_publisher', 3, 'a third puB'),
            ('set_rating', 1, 2.3),
            ('set_rating', 2, 0),
            ('set_rating', 3, 8),
            ('set_timestamp', 1, None),
            ('set_timestamp', 2, '2011-1-7'),
            ('set_uuid', 1, None),
            ('set_uuid', 2, 'a test uuid'),
            ('set_title', 1, 'title two'),
            ('set_title', 2, None),
            ('set_title', 3, 'The Test Title'),
            ('set_tags', 1, ['a1', 'a2'], True),
            ('set_tags', 2, ['b1', 'tag one'], False, False, False, True),
            ('set_tags', 3, ['A1']),
            (db.refresh, ),
            ('title', 0),
            ('title', 1),
            ('title', 2),
            ('title_sort', 0),
            ('title_sort', 1),
            ('title_sort', 2),
            ('authors', 0),
            ('authors', 1),
            ('authors', 2),
            ('author_sort', 0),
            ('author_sort', 1),
            ('author_sort', 2),
            ('has_cover', 3),
            ('has_cover', 1),
            ('has_cover', 2),
            ('get_identifiers', 0),
            ('get_identifiers', 1),
            ('get_identifiers', 2),
            ('pubdate', 0),
            ('pubdate', 1),
            ('pubdate', 2),
            ('timestamp', 0),
            ('timestamp', 1),
            ('timestamp', 2),
            ('publisher', 0),
            ('publisher', 1),
            ('publisher', 2),
            ('rating', 0),
            ('+rating', 1, lambda x: x or 0),
            ('rating', 2),
            ('series', 0),
            ('series', 1),
            ('series', 2),
            ('series_index', 0),
            ('series_index', 1),
            ('series_index', 2),
            ('uuid', 0),
            ('uuid', 1),
            ('uuid', 2),
            ('isbn', 0),
            ('isbn', 1),
            ('isbn', 2),
            ('@tags', 0),
            ('@tags', 1),
            ('@tags', 2),
            ('@all_tags', ),
            ('@get_all_identifier_types', ),
            ('set_title_sort', 1, 'Title Two'),
            ('set_title_sort', 2, None),
            ('set_title_sort', 3, 'The Test Title_sort'),
            ('set_series_index', 1, 2.3),
            ('set_series_index', 2, 0),
            ('set_series_index', 3, 8),
            ('set_identifier', 1, 'moose', 'val'),
            ('set_identifier', 2, 'test', ''),
            ('set_identifier', 3, '', ''),
            (db.refresh, ),
            ('series_index', 0),
            ('series_index', 1),
            ('series_index', 2),
            ('title_sort', 0),
            ('title_sort', 1),
            ('title_sort', 2),
            ('get_identifiers', 0),
            ('get_identifiers', 1),
            ('get_identifiers', 2),
            ('@get_all_identifier_types', ),
            ('set_metadata', 1, Metadata(
                'title', ('a1', )), False, False, False, True, True),
            ('set_metadata', 3, Metadata('title', ('a1', ))),
            (db.refresh, ),
            ('title', 0),
            ('title', 1),
            ('title', 2),
            ('title_sort', 0),
            ('title_sort', 1),
            ('title_sort', 2),
            ('authors', 0),
            ('authors', 1),
            ('authors', 2),
            ('author_sort', 0),
            ('author_sort', 1),
            ('author_sort', 2),
            ('@tags', 0),
            ('@tags', 1),
            ('@tags', 2),
            ('@all_tags', ),
            ('@get_all_identifier_types', ),
        ))
        db.close()

        # Fresh copies: exercise the generic set()/delete_tag() interface.
        ndb = self.init_legacy(self.cloned_library)
        db = self.init_old(self.cloned_library)

        run_funcs(self, db, ndb, (
            ('set', 0, 'title', 'newtitle'),
            ('set', 0, 'tags', 't1,t2,tag one', True),
            ('set', 0, 'authors', 'author one & Author Two', True),
            ('set', 0, 'rating', 3.2),
            ('set', 0, 'publisher', 'publisher one', False),
            (db.refresh, ),
            ('title', 0),
            ('rating', 0),
            ('#tags', 0),
            ('#tags', 1),
            ('#tags', 2),
            ('authors', 0),
            ('authors', 1),
            ('authors', 2),
            ('publisher', 0),
            ('publisher', 1),
            ('publisher', 2),
            ('delete_tag', 'T1'),
            ('delete_tag', 'T2'),
            ('delete_tag', 'Tag one'),
            ('delete_tag', 'News'),
            (db.clean, ),
            (db.refresh, ),
            ('@all_tags', ),
            ('#tags', 0),
            ('#tags', 1),
            ('#tags', 2),
        ))
        db.close()

        # Fresh copies: bulk tag removal.
        ndb = self.init_legacy(self.cloned_library)
        db = self.init_old(self.cloned_library)
        run_funcs(self, db, ndb, (
            ('remove_all_tags', (1, 2, 3)),
            (db.clean, ),
            ('@all_tags', ),
            ('@tags', 0),
            ('@tags', 1),
            ('@tags', 2),
        ))
        db.close()

        # Fresh copies: renaming items by id.  Look up the ids for known
        # names by inverting the new API's id maps.
        ndb = self.init_legacy(self.cloned_library)
        db = self.init_old(self.cloned_library)
        a = {v: k
             for k, v in iteritems(ndb.new_api.get_id_map('authors'))
             }['Author One']
        t = {v: k
             for k, v in iteritems(ndb.new_api.get_id_map('tags'))}['Tag One']
        s = {v: k
             for k, v in iteritems(ndb.new_api.get_id_map('series'))
             }['A Series One']
        p = {v: k
             for k, v in iteritems(ndb.new_api.get_id_map('publisher'))
             }['Publisher One']
        run_funcs(self, db, ndb, (
            ('rename_author', a, 'Author Two'),
            ('rename_tag', t, 'News'),
            ('rename_series', s, 'ss'),
            ('rename_publisher', p, 'publisher one'),
            (db.clean, ),
            (db.refresh, ),
            ('@all_tags', ),
            ('tags', 0),
            ('tags', 1),
            ('tags', 2),
            ('series', 0),
            ('series', 1),
            ('series', 2),
            ('publisher', 0),
            ('publisher', 1),
            ('publisher', 2),
            ('series_index', 0),
            ('series_index', 1),
            ('series_index', 2),
            ('authors', 0),
            ('authors', 1),
            ('authors', 2),
            ('author_sort', 0),
            ('author_sort', 1),
            ('author_sort', 2),
        ))
        db.close()
# --- Beispiel #8 (score: 0) ---
    if fmt == 'azw3':
        # AZW3 output: write the skeleton files into a temporary directory
        # and convert the resulting OPF into a single AZW3 at *path*.
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name,
                                                                    ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name,
                          DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        # EPUB-style output: assemble the zip container directly.  The
        # mimetype entry is written first and stored uncompressed.
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            zf.writestr('mimetype',
                        b'application/epub+zip',
                        compression=ZIP_STORED)
            # 0755 is Python 2 octal notation (0o755 in Python 3).
            zf.writestr('META-INF/', b'', 0755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)


if __name__ == '__main__':
    from calibre.ebooks.metadata.book.base import Metadata
    # CLI entry point: create an empty test book at the path given as the
    # last command-line argument, with the format taken from its extension.
    mi = Metadata('Test book', authors=('Kovid Goyal', ))
    path = sys.argv[-1]
    ext = path.rpartition('.')[-1].lower()
    if ext not in valid_empty_formats:
        print('Unsupported format:', ext)
        raise SystemExit(1)
    create_book(mi, path, fmt=ext)
# --- Beispiel #9 (score: 0) ---
    def test_set_metadata(self):  # {{{
        ' Test setting of metadata '
        ae = self.assertEqual
        cache = self.init_cache(self.cloned_library)

        # Check that changing title/author updates the path
        mi = cache.get_metadata(1)
        old_path = cache.field_for('path', 1)
        old_title, old_author = mi.title, mi.authors[0]
        ae(old_path, '%s/%s (1)' % (old_author, old_title))
        mi.title, mi.authors = 'New Title', ['New Author']
        cache.set_metadata(1, mi)
        ae(cache.field_for('path', 1), '%s/%s (1)' % (mi.authors[0], mi.title))
        p = cache.format_abspath(1, 'FMT1')
        self.assertTrue(mi.authors[0] in p and mi.title in p)

        # Compare old and new set_metadata()
        # First apply the same metadata swaps through the old interface and
        # remember the results ...
        db = self.init_old(self.cloned_library)
        mi = db.get_metadata(1,
                             index_is_id=True,
                             get_cover=True,
                             cover_as_data=True)
        mi2 = db.get_metadata(3,
                              index_is_id=True,
                              get_cover=True,
                              cover_as_data=True)
        db.set_metadata(2, mi)
        db.set_metadata(1, mi2, force_changes=True)
        oldmi = db.get_metadata(2,
                                index_is_id=True,
                                get_cover=True,
                                cover_as_data=True)
        oldmi2 = db.get_metadata(1,
                                 index_is_id=True,
                                 get_cover=True,
                                 cover_as_data=True)
        db.close()
        del db
        # ... then repeat them on a fresh cache and require identical output
        # (volatile fields excluded).
        cache = self.init_cache(self.cloned_library)
        cache.set_metadata(2, mi)
        nmi = cache.get_metadata(2, get_cover=True, cover_as_data=True)
        ae(oldmi.cover_data, nmi.cover_data)
        self.compare_metadata(
            nmi,
            oldmi,
            exclude={'last_modified', 'format_metadata', 'formats'})
        cache.set_metadata(1, mi2, force_changes=True)
        nmi2 = cache.get_metadata(1, get_cover=True, cover_as_data=True)
        self.compare_metadata(
            nmi2,
            oldmi2,
            exclude={'last_modified', 'format_metadata', 'formats'})

        # Tag case changes must be rejected by set_metadata.
        cache = self.init_cache(self.cloned_library)
        mi = cache.get_metadata(1)
        otags = mi.tags
        mi.tags = [x.upper() for x in mi.tags]
        cache.set_metadata(3, mi)
        self.assertEqual(set(otags), set(cache.field_for('tags', 3)),
                         'case changes should not be allowed in set_metadata')

        # test that setting authors without author sort results in an
        # auto-generated authors sort
        mi = Metadata('empty', ['a1', 'a2'])
        cache.set_metadata(1, mi)
        self.assertEqual('a1 & a2', cache.field_for('author_sort', 1))
        cache.set_sort_for_authors({cache.get_item_id('authors', 'a1'): 'xy'})
        self.assertEqual('xy & a2', cache.field_for('author_sort', 1))
        mi = Metadata('empty', ['a1'])
        cache.set_metadata(1, mi)
        self.assertEqual('xy', cache.field_for('author_sort', 1))
Beispiel #10
0
 def test_input_title(self):
     # HTML entities in the <title> tag must be decoded and a canonical
     # Unknown author substituted when none is present in the source.
     parsed = get_metadata(self.get_stream('title'))
     expected = Metadata('A Title Tag &amp; Title Ⓒ', [_('Unknown')])
     self.compare_metadata(parsed, expected)
Beispiel #11
0
    def parse_details(self, raw, root):
        '''
        Parse a downloaded Amazon product page and put the resulting
        Metadata object on self.result_queue.

        :param raw: raw HTML of the product page
        :param root: parsed lxml tree of the same page

        Returns early (queueing nothing) if title, authors or ASIN cannot
        be extracted; failures of individual optional fields are logged
        and skipped.
        '''
        asin = parse_asin(root, self.log, self.url)
        if self.testing:
            # Dump the raw HTML to a temp file so test failures can be debugged
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                    suffix='.html', delete=False) as f:
                f.write(raw)
            print ('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r'%self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        # Without title, authors and ASIN the record is useless; bail out
        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r'%self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
                authors))
            return

        mi = Metadata(title, authors)
        # Non-.com stores get a domain-qualified identifier, e.g. amazon_de
        idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
        mi.set_identifier(idtype, asin)
        self.amazon_id = asin

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root, raw)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                # Give tests a deterministic series to assert against
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r'%self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
        if non_hero:
            # New style markup
            try:
                self.parse_new_details(root, mi, non_hero[0])
            except:
                self.log.exception('Failed to parse new-style book details section')
        else:
            # Old style markup: details live in a separate product section
            pd = root.xpath(self.pd_xpath)
            if pd:
                pd = pd[0]

                try:
                    isbn = self.parse_isbn(pd)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except:
                    self.log.exception('Error parsing ISBN for url: %r'%self.url)

                try:
                    mi.publisher = self.parse_publisher(pd)
                except:
                    self.log.exception('Error parsing publisher for url: %r'%self.url)

                try:
                    mi.pubdate = self.parse_pubdate(pd)
                except:
                    self.log.exception('Error parsing publish date for url: %r'%self.url)

                try:
                    lang = self.parse_language(pd)
                    if lang:
                        mi.language = lang
                except:
                    self.log.exception('Error parsing language for url: %r'%self.url)

            else:
                self.log.warning('Failed to find product description for url: %r'%self.url)

        mi.source_relevance = self.relevance

        # Cache the ISBN->ASIN and ASIN->cover mappings for later queries
        if self.amazon_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #12
0
def get_metadata_(src, encoding=None):
    '''
    Build a Metadata object from HTML source containing metadata
    definitions as described at
    https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    :param src: the HTML source, as unicode or bytes
    :param encoding: encoding used to decode *src* when it is bytes; when
        None, the encoding is auto-detected via xml_to_unicode()
    :return: a Metadata object populated with whatever fields were found
    '''
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        # Comment-style definitions take precedence over <meta> tags
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages', ):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields: escape markup so the value is valid (X)HTML text
    for field in ('comments', ):
        val = get(field)
        if val:
            setattr(
                mi, field,
                val.replace('&', '&amp;').replace('<', '&lt;').replace(
                    '>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        # A trailing "[n]" on the series value encodes the series index
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except (TypeError, ValueError):
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            # Fall back to an explicit series_index definition. Discard it
            # when missing or non-numeric: mi.series_index must be a float,
            # so a raw string must never be assigned to it.
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except (TypeError, ValueError):
                series_index = None
        if series_index is not None:
            mi.series_index = series_index

    # RATING: values outside the 0-10 range are reset to zero
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except (TypeError, ValueError):
            pass

    # TAGS: each definition may itself be a comma-separated list
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
Beispiel #13
0
    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):
        self.load_config()

        # get identifying tags from book
        idn = identifiers.get('dnb-idn', None)
        isbn = check_isbn(identifiers.get('isbn', None))

        # ignore unknown authors
        if authors is "V. A." or authors is "V.A." or authors is "Unknown" or authors is "Unbekannt":
            authors = None

        if (isbn is None) and (idn is None) and (title is None) and (authors is
                                                                     None):
            log.info(
                "This plugin requires at least either ISBN, IDN, Title or Author(s)."
            )
            return None

        queries = []
        # DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results
        exact_search = {}

        if idn is not None:
            queries.append('num=' + idn)
            exact_search['idn'] = idn

        else:
            authors_v = []
            title_v = []

            if authors is not None:
                authors_v.append(' '.join(authors))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=False)))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=True)))

            if title is not None:
                title_v.append(title)
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=False)))
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=True)))

            if isbn is not None:
                exact_search['isbn'] = isbn

            # title and author
            if authors is not None and title is not None:
                for a in authors_v:
                    for t in title_v:
                        if isbn is not None:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '" AND num="' + isbn + '"')
                        else:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '"')

            # title but no author
            elif authors is not None and title is None:
                for i in authors_v:
                    if isbn is not None:
                        queries.append('per="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('per="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('tit="' + authors[0] + '" AND num="' +
                                   isbn + '"')
                else:
                    queries.append('tit="' + authors[0] + '"')

            # author but no title
            elif authors is None and title is not None:
                for i in title_v:
                    if isbn is not None:
                        queries.append('tit="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('tit="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND num="' + isbn +
                                   '"')
                else:
                    queries.append('per="' + title + '"')

            # as last resort only use isbn
            if isbn is not None:
                queries.append('num=' + isbn)

            # Sort queries descending by length (assumption: longer query -> less but better results)
            #queries.sort(key=len)
            #queries.reverse()

        # remove duplicate queries
        uniqueQueries = []
        for i in queries:
            if i not in uniqueQueries:
                uniqueQueries.append(i)

        # Process queries
        results = None

        for query in uniqueQueries:
            query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
            log.info(query)

            if self.cfg_dnb_token is None:
                results = self.getSearchResultsByScraping(log, query, timeout)
            else:
                results = self.getSearchResults(log, query, timeout)

            if results is None:
                continue

            log.info("Parsing records")

            ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
            for record in results:
                series = None
                series_index = None
                publisher = None
                pubdate = None
                languages = []
                title = None
                title_sort = None
                edition = None
                comments = None
                idn = None
                urn = None
                isbn = None
                ddc = []
                subjects_gnd = []
                subjects_non_gnd = []

                # Title: Field 245
                title_parts = []
                # if a,n,p exist: series = a, series_index = n, title = p
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..",
                        namespaces=ns):
                    series_index = i.xpath(".//marc21:subfield[@code='n']",
                                           namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match:
                        series_index = match.group(1)
                    else:
                        series_index = "0"  # looks like sometimes DNB does not know the series index and uses something like "[...]"
                    series_index = series_index.replace(',', '.')
                    series = i.xpath(".//marc21:subfield[@code='a']",
                                     namespaces=ns)[0].text.strip()
                    title_parts.append(
                        i.xpath(".//marc21:subfield[@code='p']",
                                namespaces=ns)[0].text.strip())
                    log.info("Extracted Series: %s" % series)
                    log.info("Extracted Series Index: %s" % series_index)
                    break
                # otherwise: title = a
                if len(title_parts) == 0:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):
                        title_parts.append(i.text.strip())
                        break

                # subtitle 1
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns):
                    title_parts.append(i.text.strip())
                    break

                # subtitle 2
                #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
                #    title = title + " / " + i.text.strip()
                #    break

                title = " : ".join(title_parts)
                log.info("Extracted Title: %s" % title)

                # Title_Sort
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match(
                    '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$',
                    title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(
                        filter(None, [
                            title_sort_regex.group(1).strip(),
                            title_sort_regex.group(3).strip(), ", " + sortword
                        ]))
                title_sort = " : ".join(title_sort_parts)
                log.info("Extracted Title_Sort: %s" % title_sort)

                # Authors
                authors = []
                author_sort = None
                for i in record.xpath(
                        ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # primary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # secondary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                if len(
                        authors
                ) == 0:  # if no "real" autor was found take all persons involved
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):  # secondary authors
                        name = re.sub(" \[.*\]$", "", i.text.strip())
                        authors.append(name)
                if len(authors) > 0:
                    author_sort = authors[0]
                log.info("Extracted Authors: %s" % " & ".join(authors))

                # Comments
                for i in record.xpath(
                        ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",
                        namespaces=ns):
                    if i.text.startswith("http://deposit.dnb.de/"):
                        br = self.browser
                        log.info('Downloading Comments from: %s' % i.text)
                        try:
                            comments = br.open_novisit(i.text,
                                                       timeout=30).read()
                            comments = sanitize_comments_html(comments)
                            log.info('Comments: %s' % comments)
                            break
                        except:
                            log.info("Could not download Comments from %s" % i)

                # Publisher Name and Location
                publisher_name = None
                publisher_location = None
                fields = record.xpath(
                    ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                    namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(
                        ".//marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                    publisher_location = fields[0].xpath(
                        ".//marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",
                        namespaces=ns)
                    if len(fields) > 0:
                        publisher_name = fields[0].xpath(
                            ".//marc21:subfield[@code='b' and string-length(text())>0]",
                            namespaces=ns)[0].text.strip()
                    else:
                        fields = record.xpath(
                            ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",
                            namespaces=ns)
                        if len(fields) > 0:
                            publisher_location = fields[0].xpath(
                                ".//marc21:subfield[@code='a' and string-length(text())>0]",
                                namespaces=ns)[0].text.strip()

                log.info("Extracted Publisher: %s" % publisher_name)
                log.info("Extracted Publisher Location: %s" %
                         publisher_location)

                # Publishing Date
                for i in record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",
                        namespaces=ns):
                    match = re.search("(\d{4})", i.text.strip())
                    if match is not None:
                        year = match.group(1)
                        pubdate = datetime.datetime(int(year), 1, 2)
                        break
                log.info("Extracted Publication Year: %s" % pubdate)

                # ID: IDN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    idn = i.text.strip()
                    break
                log.info("Extracted ID IDN: %s" % idn)
                if "idn" in exact_search:
                    if idn != exact_search["idn"]:
                        log.info(
                            "Extracted IDN does not match book's IDN, skipping record"
                        )
                        continue

                # ID: URN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    urn = i.text.strip()
                    break
                log.info("Extracted ID URN: %s" % urn)

                # ID: ISBN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                    match = re.search(isbn_regex, i.text.strip())
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                    break
                log.info("Extracted ID ISBN: %s" % isbn)
                if "isbn" in exact_search:
                    if isbn != exact_search["isbn"]:
                        log.info(
                            "Extracted ISBN does not match book's ISBN, skipping record"
                        )
                        continue

                # ID: Sachgruppe (DDC)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    ddc.append(i.text.strip())
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

                # Series and Series_Index
                if series is None and series_index is None:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                            namespaces=ns):
                        # Series Index
                        series_index = i.xpath(".//marc21:subfield[@code='v']",
                                               namespaces=ns)[0].text.strip()
                        match = re.search("(\d+[,\.\d+]?)", series_index)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"
                        series_index = series_index.replace(',', '.')
                        log.info("Extracted Series Index: %s" % series_index)
                        # Series
                        series = i.xpath(".//marc21:subfield[@code='a']",
                                         namespaces=ns)[0].text.strip()
                        log.info("Extracted Series: %s" % series)
                        break

                # Try to extract Series, Series Index and Title from the fetched title.
                # Caution: This overwrites DNB's series/series_index and modifies the title!
                if self.cfg_guess_series is True:
                    guessed_series = None
                    guessed_series_index = None
                    parts = re.split("[:]",
                                     self.removeSortingCharacters(title))
                    if len(parts) == 2:
                        if bool(re.search("\d", parts[0])) != bool(
                                re.search("\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search("\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]

                            match = re.match(
                                "^[\s\-–:]*(.+?)[\s\-–:]*$", textpart
                            )  # remove odd characters from start and end of the text part
                            if match:
                                textpart = match.group(1)

                            # from Titleparts like: "Name of the series - Episode 2"	OK
                            match = re.match(
                                "^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                indexpart)
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                            else:
                                # from Titleparts like: "Episode 2 Name of the series"
                                match = re.match(
                                    "^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$",
                                    indexpart)
                                if match:
                                    guessed_series_index = match.group(1)
                                    guessed_series = match.group(2)
                                    if guessed_series is None:
                                        guessed_series = textpart
                                        title = textpart + " : Band " + guessed_series_index
                                    else:
                                        title = textpart
                    elif len(parts) == 1:
                        # from Titles like: "Name of the series - Title (Episode 2)"
                        match = re.match(
                            "^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                            parts[0])
                        if match:
                            guessed_series_index = match.group(3)
                            guessed_series = match.group(1)
                            title = match.group(2)

                        else:
                            # from Titles like: "Name of the series - Episode 2"
                            match = re.match(
                                "^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                parts[0])
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                title = guessed_series + " : Band " + guessed_series_index

                    if guessed_series is not None and guessed_series_index is not None:
                        series = guessed_series
                        series_index = guessed_series_index
                        log.info("Guessed Series: %s" % series)
                        log.info("Guessed Series Index: %s" % series_index)

                # GND Subjects from 689
                for i in record.xpath(
                        ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    subjects_gnd.append(i.text.strip())
                # GND Subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        if i.text.startswith("("):
                            continue
                        subjects_gnd.append(i.text)
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

                # Non-GND subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        # ignore entries starting with "(":
                        if i.text.startswith("("):
                            continue
                        subjects_non_gnd.extend(re.split(',|;', i.text))
                # remove one-character subjects:
                for i in subjects_non_gnd:
                    if len(i) < 2:
                        subjects_non_gnd.remove(i)
                log.info("Extracted non-GND Subjects: %s" %
                         " ".join(subjects_non_gnd))

                # Edition
                for i in record.xpath(
                        ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    edition = i.text.strip()
                    break
                log.info("Extracted Edition: %s" % edition)

                # Languages
                for i in record.xpath(
                        ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    languages.append(i.text.strip())
                if languages is not None:
                    log.info("Extracted Languages: %s" % ",".join(languages))

                # Put it all together
                if self.cfg_append_edition_to_title == True and edition is not None:
                    title = title + " : " + edition

                mi = Metadata(
                    self.removeSortingCharacters(title),
                    map(lambda i: self.removeSortingCharacters(i), authors))
                mi.title_sort = self.removeSortingCharacters(title_sort)
                mi.author_sort = self.removeSortingCharacters(author_sort)
                mi.languages = languages
                mi.pubdate = pubdate
                mi.publisher = " : ".join(
                    filter(None, [
                        publisher_location,
                        self.removeSortingCharacters(publisher_name)
                    ]))
                mi.series = self.removeSortingCharacters(series)
                mi.series_index = series_index
                mi.comments = comments
                mi.isbn = isbn  # also required for cover download
                mi.set_identifier('urn', urn)
                mi.set_identifier('dnb-idn', idn)
                mi.set_identifier('ddc', ",".join(ddc))

                if self.cfg_fetch_subjects == 0:
                    mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 1:
                    if len(subjects_gnd) > 0:
                        mi.tags = self.uniq(subjects_gnd)
                    else:
                        mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 2:
                    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
                elif self.cfg_fetch_subjects == 3:
                    if len(subjects_non_gnd) > 0:
                        mi.tags = self.uniq(subjects_non_gnd)
                    else:
                        mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 4:
                    mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 5:
                    mi.tags = []

                # put current result's metdata into result queue
                log.info("Final formatted result: %s" % mi)
                result_queue.put(mi)
 def set_mi(self, mi, fm):
     '''
     Set the metadata for the test result books table and rebuild the
     table contents. It doesn't reset the contents of the field selectors
     for editing rules.

     :param mi: a Metadata object, a list of Metadata objects, or a falsy
                value. When falsy, a single dummy book with representative
                values for every standard field is synthesized so that
                templates can still be previewed.
     :param fm: field metadata used to look up custom column datatypes;
                may be None, in which case a copy is fetched from the
                current library.
     '''
     self.fm = fm
     if mi:
         # Normalize to a sequence; the table shows one row per book.
         if not isinstance(mi, list):
             mi = (mi, )
     else:
         # No metadata supplied: build a dummy book with plausible values
         # for the standard fields.
         mi = Metadata(_('Title'), [_('Author')])
         mi.author_sort = _('Author Sort')
         mi.series = ngettext('Series', 'Series', 1)
         mi.series_index = 3
         mi.rating = 4.0
         mi.tags = [_('Tag 1'), _('Tag 2')]
         mi.languages = ['eng']
         mi.id = 1
         if self.fm is not None:
             mi.set_all_user_metadata(self.fm.custom_field_metadata())
         else:
             # No field metadata. Grab a copy from the current library so
             # that we can validate any custom column names. The values for
             # the columns will all be empty, which in some very unusual
             # cases might cause formatter errors. We can live with that.
             from calibre.gui2.ui import get_gui
             fm = get_gui().current_db.new_api.field_metadata
             mi.set_all_user_metadata(fm.custom_field_metadata())
         # Give every custom column a non-empty dummy value appropriate
         # for its datatype.
         for col in mi.get_all_user_metadata(False):
             if fm[col]['datatype'] == 'datetime':
                 mi.set(col, DEFAULT_DATE)
             elif fm[col]['datatype'] in ('int', 'float', 'rating'):
                 mi.set(col, 2)
             elif fm[col]['datatype'] == 'bool':
                 mi.set(col, False)
             elif fm[col]['is_multiple']:
                 mi.set(col, (col, ))
             else:
                 mi.set(col, col, 1)
         mi = (mi, )
     self.mi = mi
     tv = self.template_value
     tv.setColumnCount(2)
     tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
     tv.horizontalHeader().setStretchLastSection(True)
     tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
     tv.setRowCount(len(mi))
     # Set the height of the table: at most 5 visible rows.
     h = tv.rowHeight(0) * min(len(mi), 5)
     h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
     tv.setMinimumHeight(h)
     tv.setMaximumHeight(h)
     # Set the size of the title column
     if self.table_column_widths:
         tv.setColumnWidth(0, self.table_column_widths[0])
     else:
         tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
     tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
     tv.setRowCount(len(mi))
     # Use our own widget to get rid of elision. setTextElideMode() doesn't work
     for r in range(0, len(mi)):
         w = QLineEdit(tv)
         w.setReadOnly(True)
         tv.setCellWidget(r, 0, w)
         w = QLineEdit(tv)
         w.setReadOnly(True)
         tv.setCellWidget(r, 1, w)
     self.display_values('')
# Beispiel #15
# 0
def to_metadata(browser, log, entry_, timeout):  # {{{
    '''
    Convert one <atom:entry> from a Google Books search feed into a calibre
    Metadata object, fetching the entry's details feed for the extra fields
    (comments, language, publisher, ISBNs, tags, pubdate, cover).

    :param browser: browser object used to fetch the details feed
    :param log: calibre logger
    :param entry_: lxml element for the <atom:entry> to convert
    :param timeout: network timeout for the details request
    :return: a Metadata instance, or None if the entry has no id URL or no
             title text
    '''
    from lxml import etree

    # total_results  = XPath('//openSearch:totalResults')
    # start_index    = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    url = XPath('descendant::atom:link[@rel="self"]/@href')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')

    # print(etree.tostring(entry_, pretty_print=True))

    def get_text(extra, x):
        # Stripped text of the first node matched by XPath x, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    details_url = url(entry_)[0]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # BUG FIX: the original tested `not title`, but `title` is the compiled
    # XPath evaluator (a callable, always truthy). The extracted title text
    # is `title_`, so entries with an empty title were never discarded.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, details_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                                               strip_encoding_pats=True)[0],
                                parser=etree.XMLParser(recover=True,
                                                       no_network=True,
                                                       resolve_entities=False))
        extra = entry(feed)[0]
    except Exception:
        # Best effort: return the basic metadata if details cannot be fetched
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN: keep all valid ISBNs, prefer the longest (ISBN-13 over ISBN-10)
    isbns = []
    for x in identifier(extra):
        t = type('')(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags: Google returns slash-separated category paths; split and de-dup
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        # commas separate tags in calibre, so replace them within a tag
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            # Mid-month default avoids timezone-induced month rollover
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'
    ):
        mi.has_google_cover = x.get('href')
        break

    return mi
# Beispiel #16
# 0
    def __init__(self,
                 parent,
                 text,
                 mi=None,
                 fm=None,
                 color_field=None,
                 icon_field_key=None,
                 icon_rule_kind=None,
                 doing_emblem=False,
                 text_is_placeholder=False,
                 dialog_is_st_editor=False,
                 global_vars=None,
                 all_functions=None,
                 builtin_functions=None):
        '''
        Build the template editor dialog.

        :param parent: parent widget
        :param text: initial template text, or None for an empty editor
        :param mi: Metadata (or list of Metadata) used to preview template
                   results; when falsy a dummy book is synthesized
        :param fm: field metadata used to populate the column selectors and
                   to validate custom column names
        :param color_field: when not None, edit a column-coloring rule for
                   this field
        :param icon_field_key: when not None, edit a column-icon rule for
                   this field
        :param icon_rule_kind: which icon rule kind is initially selected
        :param doing_emblem: True when editing a cover emblem rule
        :param text_is_placeholder: show `text` as placeholder text instead
                   of editor content
        :param dialog_is_st_editor: True when embedded in the stored
                   template editor (hides the button box)
        :param global_vars: dict of template global variables, or None
        :param all_functions: template functions to offer; defaults to all
                   registered formatter functions
        :param builtin_functions: builtin functions used by the syntax
                   highlighter; defaults to builtins and aliases
        '''
        QDialog.__init__(self, parent)
        Ui_TemplateDialog.__init__(self)
        self.setupUi(self)

        self.coloring = color_field is not None
        self.iconing = icon_field_key is not None
        self.embleming = doing_emblem
        self.dialog_is_st_editor = dialog_is_st_editor
        if global_vars is None:
            self.global_vars = {}
        else:
            self.global_vars = global_vars

        # Collect (display name, lookup key) pairs for the field combos.
        cols = []
        if fm is not None:
            for key in sorted(
                    displayable_columns(fm),
                    key=lambda k: sort_key(fm[k]['name']
                                           if k != color_row_key else 0)):
                if key == color_row_key and not self.coloring:
                    continue
                from calibre.gui2.preferences.coloring import all_columns_string
                name = all_columns_string if key == color_row_key else fm[key][
                    'name']
                if name:
                    cols.append((name, key))

        self.color_layout.setVisible(False)
        self.icon_layout.setVisible(False)

        if self.coloring:
            self.color_layout.setVisible(True)
            for n1, k1 in cols:
                self.colored_field.addItem(
                    n1 + (' (' + k1 + ')' if k1 != color_row_key else ''), k1)
            self.colored_field.setCurrentIndex(
                self.colored_field.findData(color_field))
        elif self.iconing or self.embleming:
            self.icon_layout.setVisible(True)
            if self.embleming:
                self.icon_kind_label.setVisible(False)
                self.icon_kind.setVisible(False)
                self.icon_chooser_label.setVisible(False)
                self.icon_field.setVisible(False)

            for n1, k1 in cols:
                self.icon_field.addItem('{} ({})'.format(n1, k1), k1)
            # Offer all .png files already present in the config icon dir.
            self.icon_file_names = []
            d = os.path.join(config_dir, 'cc_icons')
            if os.path.exists(d):
                for icon_file in os.listdir(d):
                    icon_file = icu_lower(icon_file)
                    if os.path.exists(os.path.join(d, icon_file)):
                        if icon_file.endswith('.png'):
                            self.icon_file_names.append(icon_file)
            self.icon_file_names.sort(key=sort_key)
            self.update_filename_box()

            if self.iconing:
                dex = 0
                from calibre.gui2.preferences.coloring import icon_rule_kinds
                for i, tup in enumerate(icon_rule_kinds):
                    txt, val = tup
                    self.icon_kind.addItem(txt, userData=(val))
                    if val == icon_rule_kind:
                        dex = i
                self.icon_kind.setCurrentIndex(dex)
                self.icon_field.setCurrentIndex(
                    self.icon_field.findData(icon_field_key))

        if dialog_is_st_editor:
            self.buttonBox.setVisible(False)
        else:
            self.new_doc_label.setVisible(False)
            self.new_doc.setVisible(False)
            self.template_name_label.setVisible(False)
            self.template_name.setVisible(False)

        if mi:
            # Normalize to a sequence; the preview table shows one row per book.
            if not isinstance(mi, list):
                mi = (mi, )
        else:
            # No metadata given: synthesize a dummy book so templates can
            # still be previewed.
            mi = Metadata(_('Title'), [_('Author')])
            mi.author_sort = _('Author Sort')
            mi.series = ngettext('Series', 'Series', 1)
            mi.series_index = 3
            mi.rating = 4.0
            mi.tags = [_('Tag 1'), _('Tag 2')]
            mi.languages = ['eng']
            mi.id = 1
            if fm is not None:
                mi.set_all_user_metadata(fm.custom_field_metadata())
            else:
                # No field metadata. Grab a copy from the current library so
                # that we can validate any custom column names. The values for
                # the columns will all be empty, which in some very unusual
                # cases might cause formatter errors. We can live with that.
                from calibre.gui2.ui import get_gui
                mi.set_all_user_metadata(get_gui(
                ).current_db.new_api.field_metadata.custom_field_metadata())
            for col in mi.get_all_user_metadata(False):
                mi.set(col, (col, ), 0)
            mi = (mi, )
        self.mi = mi

        # Set up the display table
        self.table_column_widths = None
        try:
            self.table_column_widths = \
                        gprefs.get('template_editor_table_widths', None)
        except Exception:
            pass
        tv = self.template_value
        tv.setRowCount(len(mi))
        tv.setColumnCount(2)
        tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
        tv.horizontalHeader().setStretchLastSection(True)
        tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
        # Set the height of the table: at most 5 visible rows.
        h = tv.rowHeight(0) * min(len(mi), 5)
        h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
        tv.setMinimumHeight(h)
        tv.setMaximumHeight(h)
        # Set the size of the title column
        if self.table_column_widths:
            tv.setColumnWidth(0, self.table_column_widths[0])
        else:
            tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
        # Use our own widget to get rid of elision. setTextElideMode() doesn't work
        for r in range(0, len(mi)):
            w = QLineEdit(tv)
            w.setReadOnly(True)
            tv.setCellWidget(r, 0, w)
            w = QLineEdit(tv)
            w.setReadOnly(True)
            tv.setCellWidget(r, 1, w)
        tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)

        # Remove help icon on title bar
        icon = self.windowIcon()
        self.setWindowFlags(self.windowFlags()
                            & (~Qt.WindowType.WindowContextHelpButtonHint))
        self.setWindowIcon(icon)

        self.all_functions = all_functions if all_functions else formatter_functions(
        ).get_functions()
        self.builtins = (builtin_functions if builtin_functions else
                         formatter_functions().get_builtins_and_aliases())

        self.last_text = ''
        self.highlighter = TemplateHighlighter(self.textbox.document(),
                                               builtin_functions=self.builtins)
        self.textbox.cursorPositionChanged.connect(self.text_cursor_changed)
        self.textbox.textChanged.connect(self.textbox_changed)
        self.textbox.setFont(self.get_current_font())

        self.textbox.setTabStopWidth(10)
        self.source_code.setTabStopWidth(10)
        self.documentation.setReadOnly(True)
        self.source_code.setReadOnly(True)

        if text is not None:
            if text_is_placeholder:
                self.textbox.setPlaceholderText(text)
                self.textbox.clear()
                text = ''
            else:
                self.textbox.setPlainText(text)
        else:
            text = ''
        self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText(
            _('&OK'))
        self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText(
            _('&Cancel'))

        self.color_copy_button.clicked.connect(self.color_to_clipboard)
        self.filename_button.clicked.connect(self.filename_button_clicked)
        self.icon_copy_button.clicked.connect(self.icon_to_clipboard)

        try:
            with open(P('template-functions.json'), 'rb') as f:
                # BUG FIX: json.load() no longer accepts an `encoding`
                # argument (removed in Python 3.9). Passing it raised a
                # TypeError that the bare except silently swallowed, so the
                # builtin function sources were never loaded. JSON is UTF-8
                # by specification, so no argument is needed.
                self.builtin_source_dict = json.load(f)
        except Exception:
            self.builtin_source_dict = {}

        func_names = sorted(self.all_functions)
        self.function.clear()
        self.function.addItem('')
        for f in func_names:
            self.function.addItem(
                '{}  --  {}'.format(
                    f, self.function_type_string(f, longform=False)), f)
        self.function.setCurrentIndex(0)
        self.function.currentIndexChanged.connect(self.function_changed)
        self.display_values(text)
        self.rule = (None, '')

        tt = _('Template language tutorial')
        self.template_tutorial.setText(
            '<a href="%s">%s</a>' % (localize_user_manual_link(
                'https://manual.calibre-ebook.com/template_lang.html'), tt))
        tt = _('Template function reference')
        self.template_func_reference.setText(
            '<a href="%s">%s</a>' % (localize_user_manual_link(
                'https://manual.calibre-ebook.com/generated/en/template_ref.html'
            ), tt))

        # Breakpoint controls are only enabled when break-on-print is set.
        s = gprefs.get('template_editor_break_on_print', False)
        self.go_button.setEnabled(s)
        self.remove_all_button.setEnabled(s)
        self.set_all_button.setEnabled(s)
        self.toggle_button.setEnabled(s)
        self.breakpoint_line_box.setEnabled(s)
        self.breakpoint_line_box_label.setEnabled(s)
        self.break_box.setChecked(s)
        self.break_box.stateChanged.connect(self.break_box_changed)
        self.go_button.clicked.connect(self.go_button_pressed)
        self.textbox.setFocus()
        self.set_up_font_boxes()
        self.toggle_button.clicked.connect(self.toggle_button_pressed)
        self.remove_all_button.clicked.connect(self.remove_all_button_pressed)
        self.set_all_button.clicked.connect(self.set_all_button_pressed)

        self.load_button.clicked.connect(self.load_template)
        self.save_button.clicked.connect(self.save_template)

        self.textbox.setWordWrapMode(QTextOption.WordWrap)
        self.textbox.setContextMenuPolicy(
            Qt.ContextMenuPolicy.CustomContextMenu)
        self.textbox.customContextMenuRequested.connect(self.show_context_menu)
        # Now geometry
        try:
            geom = gprefs.get('template_editor_dialog_geometry', None)
            if geom is not None:
                QApplication.instance().safe_restore_geometry(
                    self, QByteArray(geom))
        except Exception:
            pass
# Beispiel #17
# 0
def to_metadata(browser, log, entry_, timeout):  # {{{
    '''
    Convert one <atom:entry> from a Douban search feed into a calibre
    Metadata object, fetching the entry's details page for the extra fields
    (comments, publisher, ISBNs, tags, pubdate, rating, cover).

    :param browser: browser object used to fetch the details feed
    :param log: calibre logger
    :param entry_: lxml element for the <atom:entry> to convert
    :param timeout: network timeout for the details request
    :return: a Metadata instance, or None if the entry has no id URL or no
             title text
    '''
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        # Stripped text of the first node matched by XPath x, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # BUG FIX: the original tested `not title`, but `title` is the compiled
    # XPath evaluator (a callable, always truthy). The extracted title text
    # is `title_`, so entries with an empty title were never discarded.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw),
                           strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        # Best effort: return the basic metadata if details cannot be fetched
        log.exception('Failed to get additional details for', mi.title)
        return mi
    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN: keep all valid ISBNs, prefer the longest (ISBN-13 over ISBN-10)
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags: Douban tags may contain slash-separated paths; split and de-dup
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        # commas separate tags in calibre, so replace them within a tag
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            # Mid-month default avoids timezone-induced month rollover
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings: Douban averages are out of 10; calibre ratings are out of 5
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        # Request the large version of the cover image
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
# Beispiel #18
# 0
    def __init__(self, parent, text, mi=None, fm=None, color_field=None,
                 icon_field_key=None, icon_rule_kind=None, doing_emblem=False,
                 text_is_placeholder=False, dialog_is_st_editor=False,
                 global_vars=None, all_functions=None, builtin_functions=None):
        '''
        Build the template editor dialog.

        :param parent: parent widget
        :param text: initial template text, or None for an empty editor
        :param mi: Metadata used to preview template results; when falsy a
                   dummy book is synthesized
        :param fm: field metadata used to populate the column selectors and
                   to validate custom column names
        :param color_field: when not None, edit a column-coloring rule
        :param icon_field_key: when not None, edit a column-icon rule
        :param icon_rule_kind: which icon rule kind is initially selected
        :param doing_emblem: True when editing a cover emblem rule
        :param text_is_placeholder: show `text` as placeholder text instead
                   of editor content
        :param dialog_is_st_editor: True when embedded in the stored
                   template editor (hides the button box)
        :param global_vars: dict of template global variables, or None
        :param all_functions: template functions to offer; defaults to all
                   registered formatter functions
        :param builtin_functions: builtin functions used by the syntax
                   highlighter; defaults to the registered builtins
        '''
        QDialog.__init__(self, parent)
        Ui_TemplateDialog.__init__(self)
        self.setupUi(self)

        self.coloring = color_field is not None
        self.iconing = icon_field_key is not None
        self.embleming = doing_emblem
        self.dialog_is_st_editor = dialog_is_st_editor
        # BUG FIX: the original signature used the mutable default
        # `global_vars={}`, a single dict shared by every instance that
        # omitted the argument; any mutation leaked across dialogs. Use a
        # None sentinel instead (backward compatible for all callers).
        self.global_vars = {} if global_vars is None else global_vars

        # Collect (display name, lookup key) pairs for the field combos.
        cols = []
        if fm is not None:
            for key in sorted(displayable_columns(fm),
                              key=lambda k: sort_key(fm[k]['name'] if k != color_row_key else 0)):
                if key == color_row_key and not self.coloring:
                    continue
                from calibre.gui2.preferences.coloring import all_columns_string
                name = all_columns_string if key == color_row_key else fm[key]['name']
                if name:
                    cols.append((name, key))

        self.color_layout.setVisible(False)
        self.icon_layout.setVisible(False)

        if self.coloring:
            self.color_layout.setVisible(True)
            for n1, k1 in cols:
                self.colored_field.addItem(n1 +
                       (' (' + k1 + ')' if k1 != color_row_key else ''), k1)
            self.colored_field.setCurrentIndex(self.colored_field.findData(color_field))
        elif self.iconing or self.embleming:
            self.icon_layout.setVisible(True)
            if self.embleming:
                self.icon_kind_label.setVisible(False)
                self.icon_kind.setVisible(False)
                self.icon_chooser_label.setVisible(False)
                self.icon_field.setVisible(False)

            for n1, k1 in cols:
                self.icon_field.addItem('{} ({})'.format(n1, k1), k1)
            # Offer all .png files already present in the config icon dir.
            self.icon_file_names = []
            d = os.path.join(config_dir, 'cc_icons')
            if os.path.exists(d):
                for icon_file in os.listdir(d):
                    icon_file = icu_lower(icon_file)
                    if os.path.exists(os.path.join(d, icon_file)):
                        if icon_file.endswith('.png'):
                            self.icon_file_names.append(icon_file)
            self.icon_file_names.sort(key=sort_key)
            self.update_filename_box()

            if self.iconing:
                dex = 0
                from calibre.gui2.preferences.coloring import icon_rule_kinds
                for i,tup in enumerate(icon_rule_kinds):
                    txt,val = tup
                    self.icon_kind.addItem(txt, userData=(val))
                    if val == icon_rule_kind:
                        dex = i
                self.icon_kind.setCurrentIndex(dex)
                self.icon_field.setCurrentIndex(self.icon_field.findData(icon_field_key))

        if dialog_is_st_editor:
            self.buttonBox.setVisible(False)
        else:
            self.new_doc_label.setVisible(False)
            self.new_doc.setVisible(False)
            self.template_name_label.setVisible(False)
            self.template_name.setVisible(False)

        if mi:
            self.mi = mi
        else:
            # No metadata given: synthesize a dummy book so templates can
            # still be previewed.
            self.mi = Metadata(_('Title'), [_('Author')])
            self.mi.author_sort = _('Author Sort')
            self.mi.series = ngettext('Series', 'Series', 1)
            self.mi.series_index = 3
            self.mi.rating = 4.0
            self.mi.tags = [_('Tag 1'), _('Tag 2')]
            self.mi.languages = ['eng']
            self.mi.id = 1
            if fm is not None:
                self.mi.set_all_user_metadata(fm.custom_field_metadata())
            else:
                # No field metadata. Grab a copy from the current library so
                # that we can validate any custom column names. The values for
                # the columns will all be empty, which in some very unusual
                # cases might cause formatter errors. We can live with that.
                from calibre.gui2.ui import get_gui
                self.mi.set_all_user_metadata(
                      get_gui().current_db.new_api.field_metadata.custom_field_metadata())
            for col in self.mi.get_all_user_metadata(False):
                self.mi.set(col, (col,), 0)

        # Remove help icon on title bar
        icon = self.windowIcon()
        self.setWindowFlags(self.windowFlags()&(~Qt.WindowType.WindowContextHelpButtonHint))
        self.setWindowIcon(icon)

        self.all_functions = all_functions if all_functions else formatter_functions().get_functions()
        self.builtins = builtin_functions if builtin_functions else formatter_functions().get_builtins()

        self.last_text = ''
        self.highlighter = TemplateHighlighter(self.textbox.document(), builtin_functions=self.builtins)
        self.textbox.cursorPositionChanged.connect(self.text_cursor_changed)
        self.textbox.textChanged.connect(self.textbox_changed)

        self.textbox.setTabStopWidth(10)
        self.source_code.setTabStopWidth(10)
        self.documentation.setReadOnly(True)
        self.source_code.setReadOnly(True)

        if text is not None:
            if text_is_placeholder:
                self.textbox.setPlaceholderText(text)
                self.textbox.clear()
            else:
                self.textbox.setPlainText(text)
        self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText(_('&OK'))
        self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText(_('&Cancel'))
        self.color_copy_button.clicked.connect(self.color_to_clipboard)
        self.filename_button.clicked.connect(self.filename_button_clicked)
        self.icon_copy_button.clicked.connect(self.icon_to_clipboard)

        try:
            with open(P('template-functions.json'), 'rb') as f:
                # BUG FIX: json.load() no longer accepts an `encoding`
                # argument (removed in Python 3.9). Passing it raised a
                # TypeError that the bare except silently swallowed, so the
                # builtin function sources were never loaded. JSON is UTF-8
                # by specification, so no argument is needed.
                self.builtin_source_dict = json.load(f)
        except Exception:
            self.builtin_source_dict = {}

        func_names = sorted(self.all_functions)
        self.function.clear()
        self.function.addItem('')
        for f in func_names:
            self.function.addItem('{}  --  {}'.format(f,
                               self.function_type_string(f, longform=False)), f)
        self.function.setCurrentIndex(0)
        self.function.currentIndexChanged.connect(self.function_changed)
        self.textbox_changed()
        self.rule = (None, '')

        tt = _('Template language tutorial')
        self.template_tutorial.setText(
            '<a href="%s">%s</a>' % (
                localize_user_manual_link('https://manual.calibre-ebook.com/template_lang.html'), tt))
        tt = _('Template function reference')
        self.template_func_reference.setText(
            '<a href="%s">%s</a>' % (
                localize_user_manual_link('https://manual.calibre-ebook.com/generated/en/template_ref.html'), tt))

        self.font_size_box.setValue(gprefs['gpm_template_editor_font_size'])
        self.font_size_box.valueChanged.connect(self.font_size_changed)
# Beispiel #19
# 0
    def test_legacy_adding_books(self):  # {{{
        'Test various adding/deleting books methods'
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.ptempfile import TemporaryFile
        # Two handles onto the same cloned library: the legacy wrapper under
        # test and the old reference implementation it must match.  ET(...)
        # runs the named method on both and asserts the results agree.
        legacy, old = self.init_legacy(self.cloned_library), self.init_old(
            self.cloned_library)
        mi = Metadata('Added Book0', authors=('Added Author', ))
        with TemporaryFile(suffix='.aff') as name:
            with open(name, 'wb') as f:
                f.write(b'xxx')
            # add_books: plain add, add with return_ids, duplicate rejection
            T = partial(ET,
                        'add_books', ([name], ['AFF'], [mi]),
                        old=old,
                        legacy=legacy)
            T()(self)
            book_id = T(kwargs={'return_ids': True})(self)[1][0]
            self.assertEqual(legacy.new_api.formats(book_id), ('AFF', ))
            T(kwargs={'add_duplicates': False})(self)
            # import_book: a fresh import must get a new uuid unless
            # preserve_uuid is requested
            mi.title = 'Added Book1'
            mi.uuid = 'uuu'
            T = partial(ET,
                        'import_book', (mi, [name]),
                        old=old,
                        legacy=legacy)
            book_id = T()(self)
            self.assertNotEqual(legacy.uuid(book_id, index_is_id=True),
                                old.uuid(book_id, index_is_id=True))
            book_id = T(kwargs={'preserve_uuid': True})(self)
            self.assertEqual(legacy.uuid(book_id, index_is_id=True),
                             old.uuid(book_id, index_is_id=True))
            self.assertEqual(legacy.new_api.formats(book_id), ('AFF', ))

            # add_format: adding the same format twice must be tolerated,
            # and replace=True overwrites the stored data
            T = partial(ET, 'add_format', old=old, legacy=legacy)
            T((0, 'AFF', BytesIO(b'fffff')))(self)
            T((0, 'AFF', BytesIO(b'fffff')))(self)
            T((0, 'AFF', BytesIO(b'fffff')), {'replace': True})(self)
        with TemporaryFile(suffix='.opf') as name:
            with open(name, 'wb') as f:
                f.write(b'zzzz')
            # Importing from an OPF file creates a record with no formats
            T = partial(ET,
                        'import_book', (mi, [name]),
                        old=old,
                        legacy=legacy)
            book_id = T()(self)
            self.assertFalse(legacy.new_api.formats(book_id))

        # create_book_entry: metadata-only record creation, with duplicate
        # suppression and an explicitly forced book id
        mi.title = 'Added Book2'
        T = partial(ET, 'create_book_entry', (mi, ), old=old, legacy=legacy)
        T()
        T({'add_duplicates': False})
        T({'force_id': 1000})

        with TemporaryFile(suffix='.txt') as name:
            with open(name, 'wb') as f:
                f.write(b'tttttt')
            # add_catalog: re-adding under the same title reuses the book id;
            # a different title creates a new (higher) id
            bid = legacy.add_catalog(name, 'My Catalog')
            self.assertEqual(old.add_catalog(name, 'My Catalog'), bid)
            cache = legacy.new_api
            self.assertEqual(cache.formats(bid), ('TXT', ))
            self.assertEqual(cache.field_for('title', bid), 'My Catalog')
            self.assertEqual(cache.field_for('authors', bid), ('calibre', ))
            self.assertEqual(cache.field_for('tags', bid), (_('Catalog'), ))
            self.assertTrue(bid < legacy.add_catalog(name, 'Something else'))
            self.assertEqual(legacy.add_catalog(name, 'My Catalog'), bid)
            self.assertEqual(old.add_catalog(name, 'My Catalog'), bid)

            # add_news: tags come from the News tag, the title and custom_tags
            bid = legacy.add_news(
                name, {
                    'title': 'Events',
                    'add_title_tag': True,
                    'custom_tags': ('one', 'two')
                })
            self.assertEqual(cache.formats(bid), ('TXT', ))
            self.assertEqual(cache.field_for('authors', bid), ('calibre', ))
            self.assertEqual(cache.field_for('tags', bid),
                             (_('News'), 'Events', 'one', 'two'))

        # Cover round-trip: remove the cover from book 1, restore it on book 3
        self.assertTrue(legacy.cover(1, index_is_id=True))
        origcov = legacy.cover(1, index_is_id=True)
        self.assertTrue(legacy.has_cover(1))
        legacy.remove_cover(1)
        self.assertFalse(legacy.has_cover(1))
        self.assertFalse(legacy.cover(1, index_is_id=True))
        legacy.set_cover(3, origcov)
        self.assertEqual(legacy.cover(3, index_is_id=True), origcov)
        self.assertTrue(legacy.has_cover(3))

        # Format removal
        self.assertTrue(legacy.format(1, 'FMT1', index_is_id=True))
        legacy.remove_format(1, 'FMT1', index_is_id=True)
        self.assertIsNone(legacy.format(1, 'FMT1', index_is_id=True))

        # Book deletion is applied through both handles; dump_metadata must
        # still work on the remaining ids
        legacy.delete_book(1)
        old.delete_book(1)
        self.assertNotIn(1, legacy.all_ids())
        legacy.dump_metadata((2, 3))
        old.close()
Beispiel #20
0
# Ad-hoc exploration script for the crossref-doi-download calibre plugin:
# query the Crossref REST API for a known book and drop into an IPython
# shell with the parsed results for interactive inspection.
import json
from calibre.utils.logging import Log
from calibre.ebooks.metadata.book.base import Metadata
from calibre_plugins.crossref_doi_download import DoiMeta
from calibre_plugins.crossref_doi_download.doi_reader import DoiReader, get_title, get_author_list
reader = DoiReader(Log())
dm = DoiMeta('./plugin/')
br = dm.browser

# Bibliographic query for "Bayesian Data Analysis" by Andrew Gelman
fullurl = 'https://api.crossref.org/works?query.bibliographic=Bayesian+data+analysis+Andrew+Gelman&mailto=vikoya5988%40oniaj.com'
cdata = br.open(fullurl).read()

data = json.loads(cdata)
message = data['message']
results = message['items']

identifiers = {}
# fin = map(lambda x:reader.result2meta(x,identifiers),results)

# Exercise result2meta on every returned item
for r in results:
    reader.result2meta(r,identifiers)

# Build a Metadata object from the second result for manual inspection
result = results[1]
title = get_title(result)
authors = get_author_list(result)
mi = Metadata(title=title, authors=authors)

from calibre import ipython
ipython(locals())
Beispiel #21
0
    def test_sorting(self):  # {{{
        'Test sorting'
        cache = self.init_cache()
        ae = self.assertEqual
        # For each field, the expected ascending order of the three fixture
        # book ids; descending must be the exact reverse (stable sorts only)
        for field, order in {
                'title': [2, 1, 3],
                'authors': [2, 1, 3],
                'series': [3, 1, 2],
                'tags': [3, 1, 2],
                'rating': [3, 2, 1],
                # 'identifiers': [3, 2, 1], There is no stable sort since 1 and
                # 2 have the same identifier keys
                # 'last_modified': [3, 2, 1], There is no stable sort as two
                # records have the exact same value
                'timestamp': [2, 1, 3],
                'pubdate': [1, 2, 3],
                'publisher': [3, 2, 1],
                'languages': [3, 2, 1],
                'comments': [3, 2, 1],
                '#enum': [3, 2, 1],
                '#authors': [3, 2, 1],
                '#date': [3, 1, 2],
                '#rating': [3, 2, 1],
                '#series': [3, 2, 1],
                '#tags': [3, 2, 1],
                '#yesno': [2, 1, 3],
                '#comments': [3, 2, 1],
                'id': [1, 2, 3],
        }.iteritems():
            x = list(reversed(order))
            ae(order, cache.multisort([(field, True)], ids_to_sort=x),
               'Ascending sort of %s failed' % field)
            ae(x, cache.multisort([(field, False)], ids_to_sort=order),
               'Descending sort of %s failed' % field)

        # Test sorting of is_multiple fields.

        # Author like fields should be sorted by generating sort names from the
        # actual values in entry order
        for field in ('authors', '#authors'):
            ae(
                cache.set_field(
                    field, {
                        1: ('aa bb', 'bb cc', 'cc dd'),
                        2: ('bb aa', 'xx yy'),
                        3: ('aa bb', 'bb aa')
                    }), {1, 2, 3})
            ae([2, 3, 1],
               cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
            ae([1, 3, 2],
               cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))

        # All other is_multiple fields should be sorted by sorting the values
        # for each book and using that as the sort key
        for field in ('tags', '#tags'):
            ae(
                cache.set_field(field, {
                    1: ('b', 'a'),
                    2: ('c', 'y'),
                    3: ('b', 'z')
                }), {1, 2, 3})
            ae([1, 3, 2],
               cache.multisort([(field, True)], ids_to_sort=(1, 2, 3)))
            ae([2, 3, 1],
               cache.multisort([(field, False)], ids_to_sort=(1, 2, 3)))

        # Test tweak to sort dates by visible format
        from calibre.utils.date import parse_only_date as p
        from calibre.utils.config_base import Tweak
        ae(
            cache.set_field('pubdate', {
                1: p('2001-3-3'),
                2: p('2002-2-3'),
                3: p('2003-1-3')
            }), {1, 2, 3})
        ae([1, 2, 3], cache.multisort([('pubdate', True)]))
        # Displaying only the month ('MMM') reverses the expected order of
        # these three dates (Mar < Feb < Jan alphabetically is not the point;
        # the visible-field sort must be used instead of the raw date)
        with Tweak('gui_pubdate_display_format',
                   'MMM'), Tweak('sort_dates_using_visible_fields', True):
            c2 = self.init_cache()
            ae([3, 2, 1], c2.multisort([('pubdate', True)]))

        # Test bool sorting when not tristate
        cache.set_pref('bools_are_tristate', False)
        c2 = self.init_cache()
        ae([2, 3, 1], c2.multisort([('#yesno', True), ('id', False)]))

        # Test subsorting
        ae([3, 2, 1], cache.multisort([('identifiers', True),
                                       ('title', True)]), 'Subsort failed')
        # Build 7 extra books and three custom integer columns so that the
        # multi-key subsort below has ties at every level
        from calibre.ebooks.metadata.book.base import Metadata
        for i in xrange(7):
            cache.create_book_entry(Metadata('title%d' % i),
                                    apply_import_tags=False)
        cache.create_custom_column('one', 'CC1', 'int', False)
        cache.create_custom_column('two', 'CC2', 'int', False)
        cache.create_custom_column('three', 'CC3', 'int', False)
        cache.close()
        cache = self.init_cache()
        cache.set_field('#one', {(i + (5 * m)): m
                                 for m in (0, 1) for i in xrange(1, 6)})
        cache.set_field('#two',
                        {i + (m * 3): m
                         for m in (0, 1, 2) for i in (1, 2, 3)})
        cache.set_field('#two', {10: 2})
        cache.set_field('#three', {i: i for i in xrange(1, 11)})
        ae(
            list(xrange(1, 11)),
            cache.multisort([('#one', True), ('#two', True)],
                            ids_to_sort=sorted(cache.all_book_ids())))
        ae([4, 5, 1, 2, 3, 7, 8, 9, 10, 6],
           cache.multisort([('#one', True), ('#two', False)],
                           ids_to_sort=sorted(cache.all_book_ids())))
        ae([5, 4, 3, 2, 1, 10, 9, 8, 7, 6],
           cache.multisort([('#one', True), ('#two', False),
                            ('#three', False)],
                           ids_to_sort=sorted(cache.all_book_ids())))
Beispiel #22
0
def metadata_from_xmp_packet(raw_bytes):
    '''
    Parse an XMP metadata packet and return a calibre Metadata object.

    Reads Dublin Core fields (title, creator, subject, description,
    publisher, date, language, identifier), XMP fields (CreateDate,
    CreatorTool, MetadataDate, ModifyDate, Identifier), calibre extensions
    (rating, series, title_sort, author_sort, author_link_map,
    user_categories) and prism/pdfx identifier schemes.

    :param raw_bytes: the raw XMP packet bytes
    :return: a Metadata object
    :raises ValueError: if the packet looks like the corrupted output of
        Nitro PDF
    '''
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title.startswith(r'\376\377'):
        # corrupted XMP packet generated by Nitro PDF. See
        # https://bugs.launchpad.net/calibre/+bug/1541981
        raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    # Use the more recent of MetadataDate and ModifyDate as the metadata date
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        # Take the first non-empty value for each sort field
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        # These calibre extension fields are stored as JSON strings
        val = first_simple('//calibre:'+x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    # Fall back to prism/pdfx namespaces for any known scheme not found above
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # BUG FIX: previously this always stored under the 'doi' key,
                # so an ISBN found here was misrecorded as a DOI
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #23
0
    def to_metadata(self, log, entry_, timeout):  # {{{
        '''
        Convert a single Douban API book entry (a dict) into a calibre
        Metadata object.

        :param log: logger with info/error/exception methods
        :param entry_: dict as returned by the Douban book API
        :param timeout: unused here; kept for interface compatibility
        :return: a Metadata object, or None if the entry lacks an id or title
        '''
        from calibre.utils.date import parse_date, utcnow

        log.info("to_metadata")
        douban_id = entry_.get("id")
        title = entry_.get("title")
        description = entry_.get("summary")
        # subtitle = entry_.get('subtitle')  # TODO: std metada doesn't have this field
        publisher = entry_.get("publisher")
        isbn = entry_.get("isbn13")  # ISBN11 is obsolute, use ISBN13
        pubdate = entry_.get("pubdate")
        authors = entry_.get("author")
        book_tags = entry_.get("tags")
        rating = entry_.get("rating")
        cover_url = entry_.get("cover")
        series = entry_.get("series")

        if not authors:
            authors = ["Unknown"]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {"douban": douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN: the API may return a single string or a list of strings;
        # keep the longest valid one as the primary ISBN (prefers ISBN-13)
        isbns = []
        if isinstance(isbn, (type(""), bytes)):
            if check_isbn(isbn):
                isbns.append(isbn)
        elif isbn:
            # BUG FIX: guard against isbn being None, which previously
            # raised TypeError when the else branch tried to iterate it
            for x in isbn:
                if check_isbn(x):
                    isbns.append(x)
        if isbns:
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        mi.tags = book_tags

        # pubdate: the day component is often missing, assume mid-month
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except BaseException:
                log.error("Failed to parse pubdate %r" % pubdate)

        # Rating: Douban uses a 10-point scale, calibre a 5-point one
        if rating:
            try:
                mi.rating = rating / 2.0
            except BaseException:
                log.exception("Failed to parse rating")
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        u = cover_url
        if u:
            # If URL contains "book-default", the book doesn't have a cover
            if u.find("book-default") == -1:
                mi.has_douban_cover = u

        # Series
        if series:
            mi.series = series

        return mi
Beispiel #24
0
def read_metadata_kfx(stream, read_cover=True):
    ''' Read the metadata.kfx file that is found in the sdr book folder for KFX files '''
    container = Container(stream.read())
    meta = extract_metadata(container.decode())

    def has(key):
        # True when the field exists and its first entry is non-empty
        return meta[key] and meta[key][0]

    def get(key, single=True):
        vals = meta[key]
        if single:
            return clean_xml_chars(vals[0]) if vals else ''
        return [clean_xml_chars(v) for v in vals]

    title = get('title') or _('Unknown')
    raw_authors = get('author', False) or [_('Unknown')]
    comma_name_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(name):
        # Convert "Last, First" to "First Last" unless sort copying is disabled
        if tweaks['author_sort_copy_method'] != 'copy':
            parts = comma_name_pat.match(name.strip())
            if parts is not None:
                return parts.group(2) + ' ' + parts.group(1)
        return name

    # Remove duplicate authors while retaining their original order
    unique_authors = []
    for name in map(fix_author, raw_authors):
        if name not in unique_authors:
            unique_authors.append(name)

    mi = Metadata(title, unique_authors)
    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = [
            lang for lang in (canonicalize_lang(x)
                              for x in get('languages', False)) if lang
        ]
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and meta[COVER_KEY]:
        # Cover is stored base64-encoded; validate it decodes to a real image
        try:
            data = from_base64_bytes(meta[COVER_KEY])
            fmt, w, h = identify(data)
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)

    return mi
Beispiel #25
0
 def mi(self):
     """Build a Metadata object from the dialog's title, authors and languages widgets."""
     raw_title = unicode_type(self.title.text()).strip()
     raw_authors = unicode_type(self.authors.text()).strip()
     meta = Metadata(raw_title or _('Unknown'))
     meta.authors = string_to_authors(raw_authors) or [_('Unknown')]
     meta.languages = self.languages.lang_codes or [get_lang()]
     return meta
Beispiel #26
0
    def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers):
        '''
        Parse an ISBNDb XML feed into Metadata results.

        :param feed: parsed lxml tree of the ISBNDb response
        :param seen: set of (title, authors-tuple) keys already produced,
            updated in place to de-duplicate across pages
        :param orig_title: the title that was searched for
        :param orig_authors: the authors that were searched for
        :param identifiers: identifiers dict; an 'isbn' entry restricts
            results to that ISBN
        :return: (total_results, shown_results, results) where results is a
            list of Metadata objects
        :raises ValueError: if the feed contains an error message
        '''
        from lxml import etree

        def tostring(x):
            # Text content of an element, or '' for a missing element
            if x is None:
                return ''
            return etree.tostring(x, method='text', encoding=unicode).strip()

        orig_isbn = identifiers.get('isbn', None)
        title_tokens = list(self.get_title_tokens(orig_title))
        author_tokens = list(self.get_author_tokens(orig_authors))
        results = []

        def ismatch(title, authors):
            # A record matches when at least one query title token occurs in
            # its title AND at least one author token in its authors; an
            # empty token list matches everything.
            authors = lower(' '.join(authors))
            title = lower(title)
            match = not title_tokens
            for t in title_tokens:
                if lower(t) in title:
                    match = True
                    break
            amatch = not author_tokens
            for a in author_tokens:
                if lower(a) in authors:
                    amatch = True
                    break
            return match and amatch

        bl = feed.find('BookList')
        if bl is None:
            err = tostring(feed.find('errormessage'))
            raise ValueError('ISBNDb query failed:' + err)
        total_results = int(bl.get('total_results'))
        shown_results = int(bl.get('shown_results'))
        for bd in bl.xpath('.//BookData'):
            # Skip records without a valid ISBN, or whose ISBN does not match
            # the one we were asked for
            isbn = check_isbn(bd.get('isbn', None))
            isbn13 = check_isbn(bd.get('isbn13', None))
            if not isbn and not isbn13:
                continue
            if orig_isbn and orig_isbn not in {isbn, isbn13}:
                continue
            title = tostring(bd.find('Title'))
            if not title:
                continue
            authors = []
            for au in bd.xpath('.//Authors/Person'):
                au = tostring(au)
                if au:
                    if ',' in au:
                        # "Last, First" -> "First Last".  Use a named variable
                        # for partition's separator instead of shadowing the
                        # gettext _ function.
                        ln, sep, fn = au.partition(',')
                        au = fn.strip() + ' ' + ln.strip()
                authors.append(au)
            if not authors:
                continue
            comments = tostring(bd.find('Summary'))
            # De-duplicate identical (title, authors) records across pages
            id_ = (title, tuple(authors))
            if id_ in seen:
                continue
            seen.add(id_)
            if not ismatch(title, authors):
                continue
            publisher = tostring(bd.find('PublisherText'))
            if not publisher:
                publisher = None
            # Audio editions are not useful for ebook metadata
            if publisher and 'audio' in publisher.lower():
                continue
            mi = Metadata(title, authors)
            mi.isbn = isbn
            mi.publisher = publisher
            mi.comments = comments
            results.append(mi)
        return total_results, shown_results, results
Beispiel #27
0
    def merge(self, results, min_year, do_asr=True):
        '''
        Merge a list of Metadata results from multiple download sources into
        a single Metadata object, using a per-field policy (shortest value,
        longest value, average, union, or random fallback).

        :param results: non-empty list of Metadata objects from the sources
        :param min_year: if truthy, clamp the merged pubdate to this year
        :param do_asr: if True, record the average source relevance on the
            returned object
        :return: the merged Metadata object
        '''
        ans = Metadata(_('Unknown'))

        # We assume the shortest title has the least cruft in it
        ans.title = self.length_merge('title', results, null_value=ans.title)

        # No harm in having extra authors, maybe something useful like an
        # editor or translator
        ans.authors = self.length_merge('authors',
                                        results,
                                        null_value=ans.authors,
                                        shortest=False)

        # We assume the shortest publisher has the least cruft in it
        ans.publisher = self.length_merge('publisher',
                                          results,
                                          null_value=ans.publisher)

        # We assume the smallest set of tags has the least cruft in it
        ans.tags = self.length_merge('tags',
                                     results,
                                     null_value=ans.tags,
                                     shortest=msprefs['fewer_tags'])

        # We assume the longest series has the most info in it
        ans.series = self.length_merge('series',
                                       results,
                                       null_value=ans.series,
                                       shortest=False)
        # Take the series index from whichever result supplied the chosen series
        for r in results:
            if r.series and r.series == ans.series:
                ans.series_index = r.series_index
                break

        # Average the rating over all sources
        ratings = []
        for r in results:
            rating = r.rating
            if rating and rating > 0 and rating <= 5:
                ratings.append(rating)
        if ratings:
            ans.rating = int(round(sum(ratings) / len(ratings)))

        # Smallest language is likely to be valid
        ans.language = self.length_merge('language',
                                         results,
                                         null_value=ans.language)

        # Choose longest comments
        ans.comments = self.length_merge('comments',
                                         results,
                                         null_value=ans.comments,
                                         shortest=False)

        # Published date: when a minimum year is known, prefer a result from
        # that year (keeping its month/day); otherwise use Jan 2 of min_year.
        # Without min_year, take the earliest pubdate among all results.
        if min_year:
            for r in results:
                year = getattr(r.pubdate, 'year', None)
                if year == min_year:
                    ans.pubdate = r.pubdate
                    break
            if getattr(ans.pubdate, 'year', None) == min_year:
                min_date = datetime(min_year,
                                    ans.pubdate.month,
                                    ans.pubdate.day,
                                    tzinfo=utc_tz)
            else:
                min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
            # Sentinel far in the future; year 3001 means "no date found"
            min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None:
                    candidate = as_utc(r.pubdate)
                    if candidate < min_date:
                        min_date = candidate
            if min_date.year < 3000:
                ans.pubdate = min_date

        # Identifiers: union of all sources (later results overwrite earlier)
        for r in results:
            ans.identifiers.update(r.identifiers)

        # Cover URL
        ans.has_cached_cover_url = bool(
            [r for r in results if getattr(r, 'has_cached_cover_url', False)])

        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
            if hasattr(r, 'identify_plugin'):
                touched_fields |= r.identify_plugin.touched_fields

        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
                continue
            setattr(ans, f,
                    self.random_merge(f, results, null_value=getattr(ans, f)))

        # Average source relevance, used downstream to rank merged results
        if do_asr:
            avg = [x.relevance_in_source for x in results]
            avg = sum(avg) / len(avg)
            ans.average_source_relevance = avg

        return ans
Beispiel #28
0
def get_metadata_(src, encoding=None):
    '''
    Extract book metadata from raw HTML source.

    Looks for metadata in special HTML comment tags and <meta> tags (comment
    tags take precedence), falling back to the <title> element for the title.

    :param src: HTML source, as bytes or unicode
    :param encoding: encoding to decode bytes input with; autodetected if None
    :return: a Metadata object
    '''
    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        # Comment tags take precedence over <meta> tags; empty -> None
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title: fall back to the <title> element
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES: the index may be embedded in the series name as "Name [1.5]",
    # otherwise a separate series_index field is consulted
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except (TypeError, ValueError):
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except (TypeError, ValueError):
                # BUG FIX: previously a failed conversion left the raw
                # (non-None) string in series_index, which was then assigned
                # to mi.series_index as a string
                series_index = None
        if series_index is not None:
            mi.series_index = series_index

    # RATING: accept a 10-point scale by halving; discard invalid values
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except (TypeError, ValueError):
            pass

    # TAGS: comma separated
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
Beispiel #29
0
 def start(self, title, authors, identifiers):
     """Launch the cover-download widget for the given book and run the dialog modally."""
     candidate = Metadata(title, authors)
     candidate.identifiers = identifiers
     self.covers_widget.start(candidate, self.current_cover, title, authors, {})
     return self.exec_()
Beispiel #30
0
 def validate(self, x):
     """Check that template *x* evaluates cleanly against an empty book."""
     from calibre.ebooks.metadata.book.base import Metadata
     empty_book = Metadata('')
     self.book = empty_book
     return self.vformat(x, [], {})