def convert(self, oeb_book, output, input_plugin, opts, log):
        """Convert from calibre's internal format to KePub."""
        self.epub_output_plugin.convert(oeb_book, output, input_plugin, opts, log)
        container = KEPubContainer(output, default_log)

        if container.is_drm_encumbered:
            return

        # Write the details file
        o = {
            "kepub_output_version": ".".join([str(n) for n in self.version]),
            "kepub_output_currenttime": datetime.utcnow().ctime(),
        }
        kte_data_file = self.temporary_file("_KePubOutputPluginInfo")
        kte_data_file.write(json.dumps(o))
        kte_data_file.close()
        container.copy_file_to_container(
            kte_data_file.name, name="plugininfo.kte", mt="application/json"
        )

        title = container.opf_xpath("./opf:metadata/dc:title/text()")
        if len(title) > 0:
            title = title[0]
        else:
            title = NULL_VALUES["title"]
        authors = container.opf_xpath(
            './opf:metadata/dc:creator[@opf:role="aut"]/text()'
        )
        if len(authors) < 1:
            authors = NULL_VALUES["authors"]
        mi = Metadata(title, authors)
        language = container.opf_xpath("./opf:metadata/dc:language/text()")
        if len(language) > 0:
            mi.languages = language
            language = language[0]
        else:
            mi.languages = NULL_VALUES["languages"]
            language = NULL_VALUES["language"]
        mi.language

        modify_epub(
            container,
            output,
            metadata=mi,
            opts={
                "clean_markup": opts.kepub_clean_markup,
                "hyphenate": opts.kepub_hyphenate,
                "no-hyphens": opts.kepub_disable_hyphenation,
                "smarten_punctuation": False,
                "extended_kepub_features": True,
            },
        )
Beispiel #2
0
    def convert(self, oeb_book, output, input_plugin, opts, log):
        self.epub_output_plugin.convert(oeb_book, output, input_plugin, opts,
                                        log)
        container = KEPubContainer(output, default_log)

        if container.is_drm_encumbered:
            return

        # Write the details file
        o = {
            'kepub_output_version': ".".join([str(n) for n in self.version]),
            'kepub_output_currenttime': datetime.utcnow().ctime()
        }
        kte_data_file = self.temporary_file('_KePubOutputPluginInfo')
        kte_data_file.write(json.dumps(o))
        kte_data_file.close()
        container.copy_file_to_container(kte_data_file.name,
                                         name='plugininfo.kte',
                                         mt='application/json')

        title = container.opf_xpath("./opf:metadata/dc:title/text()")
        if len(title) > 0:
            title = title[0]
        else:
            title = NULL_VALUES['title']
        authors = container.opf_xpath(
            './opf:metadata/dc:creator[@opf:role="aut"]/text()')
        if len(authors) < 1:
            authors = NULL_VALUES['authors']
        mi = Metadata(title, authors)
        language = container.opf_xpath("./opf:metadata/dc:language/text()")
        if len(language) > 0:
            mi.languages = language
            language = language[0]
        else:
            mi.languages = NULL_VALUES['languages']
            language = NULL_VALUES['language']
        mi.language

        modify_epub(container,
                    output,
                    metadata=mi,
                    opts={
                        'clean_markup': opts.kepub_clean_markup,
                        'hyphenate': opts.kepub_hyphenate,
                        'no-hyphens': opts.kepub_disable_hyphenation,
                        'replace_lang': opts.kepub_replace_lang,
                        'smarten_punctuation': False,
                        'extended_kepub_features': True
                    })
Beispiel #3
0
    def parse(self, xml_detail, xml_more_info):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_more_info)
        publisher = self.parse_publisher(xml_detail)
        tags = self.parse_tags(xml_detail, xml_more_info)
        serie, serie_index = self.parse_serie(xml_detail)
        pub_year = self.parse_pub_year(xml_detail, xml_more_info)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(as_unicode(title), authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(self.ident, cover)

            return mi
        else:
            self.log('Result skipped for because title or authors not found')
            return None
Beispiel #4
0
    def parse(self, xml_detail):
        data = xml_detail.split('\n')[1].split("|")
        self.log(data)

        title = data[1]
        authors = [data[0]]
        comments = data[13]
        isbn = data[3]
        publisher = data[6]
        pub_date_tmp = data[34].split('-')
        pub_date = datetime.datetime(int(pub_date_tmp[0]), int(pub_date_tmp[1]), int(pub_date_tmp[2]), tzinfo=utc_tz)
        if isbn is not None:
            isbn_tmp = re.sub("-", "", isbn)
            cover = "%s/images/covers/%s.jpg"%(self.plugin.BASE_URL, isbn_tmp)
        else:
            cover = None

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.publisher = publisher
            mi.pubdate = pub_date
            mi.isbn = isbn
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(self.ident, cover)

            return mi
        else:
            return None
Beispiel #5
0
def read_metadata(root):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.iteritems():
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key != 'uuid':
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    return ans
Beispiel #6
0
    def parse(self, xml_detail):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_detail)
        publisher = self.parse_publisher(xml_detail)
        pub_year = self.parse_pubdate(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:str(self.number)}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(str(self.number), cover)

            return mi
        else:
            return None
Beispiel #7
0
def read_metadata_kfx(stream, read_cover=True):
    ' Read the metadata.kfx file that is found in the sdr book folder for KFX files '
    c = Container(stream.read())
    m = extract_metadata(c.decode())
    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ''
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get('title') or _('Unknown')
    authors = get('authors', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(x):
        if tweaks['author_sort_copy_method'] != 'copy':
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + ' ' + m.group(1)
        return x

    mi = Metadata(title, [fix_author(x) for x in authors])
    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = list(filter(None, (canonicalize_lang(x) for x in get('languages', False))))
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            w, h, fmt = identify_data(data)
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w and h:
            mi.cover_data = (fmt, data)

    return mi
Beispiel #8
0
def read_metadata_kfx(stream, read_cover=True):
    " Read the metadata.kfx file that is found in the sdr book folder for KFX files "
    c = Container(stream.read())
    m = extract_metadata(c.decode())
    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ""
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get("title") or _("Unknown")
    authors = get("authors", False) or [_("Unknown")]
    auth_pat = re.compile(r"([^,]+?)\s*,\s+([^,]+)$")

    def fix_author(x):
        if tweaks["author_sort_copy_method"] != "copy":
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + " " + m.group(1)
        return x

    mi = Metadata(title, [fix_author(x) for x in authors])
    if has("author"):
        mi.author_sort = get("author")
    if has("ASIN"):
        mi.set_identifier("mobi-asin", get("ASIN"))
    elif has("content_id"):
        mi.set_identifier("mobi-asin", get("content_id"))
    if has("languages"):
        langs = list(filter(None, (canonicalize_lang(x) for x in get("languages", False))))
        if langs:
            mi.languages = langs
    if has("issue_date"):
        try:
            mi.pubdate = parse_only_date(get("issue_date"))
        except Exception:
            pass
    if has("publisher") and get("publisher") != "Unknown":
        mi.publisher = get("publisher")
    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            fmt, w, h = identify(bytes(data))
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)

    return mi
Beispiel #9
0
def read_metadata(root, ver=None, return_extra_data=False):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in iteritems(identifiers):
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        if bkp[0]:
            ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes,
                                               refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes,
                                               refines) or ans.user_categories
    for name, fm in iteritems(
            read_user_metadata(root, prefixes, refines) or {}):
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes,
                                          refines), first_spine_item(
                                              root, prefixes, refines)
    return ans
Beispiel #10
0
 def default_mi(self):
     from calibre.ebooks.metadata.book.base import Metadata
     mi = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
     mi.series = _('A series of samples')
     mi.series_index = 4
     mi.tags = [_('Tag One'), _('Tag Two')]
     mi.publisher = _('Some publisher')
     mi.rating = 4
     mi.identifiers = {'isbn':'123456789', 'url': 'https://calibre-ebook.com'}
     mi.languages = ['eng', 'fra']
     mi.pubdate = mi.timestamp = now()
     return mi
Beispiel #11
0
 def default_mi(self):
     from calibre.ebooks.metadata.book.base import Metadata
     mi = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
     mi.series = _('A series of samples')
     mi.series_index = 4
     mi.tags = [_('Tag One'), _('Tag Two')]
     mi.publisher = _('Some publisher')
     mi.rating = 4
     mi.identifiers = {'isbn':'123456789', 'url': 'http://calibre-ebook.com'}
     mi.languages = ['eng', 'fra']
     mi.pubdate = mi.timestamp = now()
     return mi
Beispiel #12
0
    def parse(self, xml_detail):
        sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
        authors = []
        tags = []
        xpath = self.XPath('//table[@id="record"]//tr')
        for row in xpath(xml_detail):
            ch = row.getchildren()
            txt = ch[0].text.strip()
            data = self.normalize(ch[1].text)
            if txt.startswith('245') and title is None:
                title = self.parse_title(data)
            if txt.startswith('246'):
                title = self.parse_title(data)
            elif txt.startswith('100') or txt.startswith('700'):
                res = self.parse_author(data)
                if res is not None:
                    authors.append(res)
            elif txt == 'SYS':
                sys_ident = data.strip()
            elif txt =='020':
                isbn = self.parse_isbn(data)
            elif txt == '260':
                publisher, pub_year = self.parse_publisher(data)
            elif txt.startswith('490') and serie is None:
                serie, serie_index = self.parse_serie(data)
            elif txt == '655 7':
                tags.append(self.parse_tags(data))

        if isbn is not None and isbn != '':
            cover = self.parse_cover(isbn)

        if title is not None and len(authors) > 0 and sys_ident is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.identifiers = {self.plugin.name:sys_ident}
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(sys_ident, cover)

            return mi
        else:
            self.log('Data not found')
            return None
Beispiel #13
0
 def test_input_meta_single(self):
     stream_meta = get_metadata(self.get_stream('meta_single'))
     canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ',
                           ['George Washington'])
     canon_meta.publisher = 'Publisher A'
     canon_meta.languages = ['English']
     canon_meta.pubdate = parse_date('2019-01-01')
     canon_meta.timestamp = parse_date('2018-01-01')
     canon_meta.series = 'Meta Series'
     canon_meta.series_index = float(1)
     # canon_meta.rating = float(0)
     # canon_meta.comments = ''
     canon_meta.tags = ['tag a', 'tag b']
     canon_meta.set_identifiers({'isbn': '1234567890'})
     self.compare_metadata(stream_meta, canon_meta)
Beispiel #14
0
def read_metadata(root, ver=None, return_extra_data=False):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.iteritems():
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        if bkp[0]:
            ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
    for name, fm in (read_user_metadata(root, prefixes, refines) or {}).iteritems():
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
    return ans
Beispiel #15
0
def read_metadata(root):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.iteritems():
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key != 'uuid':
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages

    return ans
Beispiel #16
0
    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):
        isbn = check_isbn(identifiers.get('isbn', None))
        if isbn is None:
            log.info("The plugin can only look up data by ISBN")
            return None

        url = self.QURL % isbn
        log.info('Querying: %s' % url)
        data = json.loads(
            self.browser.open_novisit(url, timeout=timeout).read())
        log.info('Got JSON data: %s' % data)

        if data.get('stat', None) != 'ok':
            return None
        data = data.get('list', [])
        if len(data) == 0:
            return None
        data = data[0]

        log.info("Book JSON data: %s" % data)

        title = data.get("title", None)
        authors = data.get("author", None)
        if title is None or authors is None:
            return None

        authors = authors.rstrip('.').replace(' and ',
                                              ',').replace(';', ',').split(',')

        mi = Metadata(title, [a.strip() for a in authors])
        mi.publisher = data.get("publisher", None)
        mi.languages = data.get("lang", None).split(',')

        year = data.get("year", None)
        if year is not None:
            mi.pubdate = datetime.datetime(int(year), 1, 1)

        log.info("Final formatted result: %s" % mi)
        result_queue.put(mi)
Beispiel #17
0
 def test_input_comment_single(self):
     stream_meta = get_metadata(self.get_stream('comment_single'))
     canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ',
                           ['James Madison', 'James Monroe'])
     canon_meta.publisher = 'Publisher C'
     canon_meta.languages = ['French']
     canon_meta.pubdate = parse_date('2015-01-01')
     canon_meta.timestamp = parse_date('2014-01-01')
     canon_meta.series = 'Comment Series'
     canon_meta.series_index = float(3)
     canon_meta.rating = float(0)
     canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
     canon_meta.tags = ['tag d']
     canon_meta.set_identifiers({
         'isbn': '3456789012',
         'url': 'http://google.com/search?q=calibre'
     })
     self.compare_metadata(stream_meta, canon_meta)
Beispiel #18
0
 def test_input_meta_multi(self):
     stream_meta = get_metadata(self.get_stream('meta_multi'))
     canon_meta = Metadata(
         'A Meta Tag &amp; Title Ⓒ',
         ['George Washington', 'John Adams', 'Thomas Jefferson'])
     canon_meta.publisher = 'Publisher A'
     canon_meta.languages = ['English', 'Spanish']
     canon_meta.pubdate = parse_date('2019-01-01')
     canon_meta.timestamp = parse_date('2018-01-01')
     canon_meta.series = 'Meta Series'
     canon_meta.series_index = float(1)
     canon_meta.rating = float(8)
     canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
     canon_meta.tags = ['tag a', 'tag b', 'tag c']
     canon_meta.set_identifiers({
         'isbn': '1234567890',
         'url': 'http://google.com/search?q=calibre'
     })
     self.compare_metadata(stream_meta, canon_meta)
Beispiel #19
0
    def parse(self, xml_detail):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.rating = rating
            mi.tags = tags
            mi.series = serie
            mi.series_index = serie_index
            return mi
        else:
            return None
Beispiel #20
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    return mi
Beispiel #21
0
    def parse_details(self, root):

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for query: %r' %
                               self.query)
            title = None

        if not title:
            self.log.error('Could not find title for %r' % self.query)

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for query: %r' %
                               self.query)
            authors = []

        if not authors:
            self.log.error('Could not find authors for %r' % self.query)

            return

        mi = Metadata(title, authors)

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                # match 10 of 13 getallen aan het begin, gevolgd door een spatie of niets
                p = re.compile('^([0-9]{13}|[0-9]{10})(?= |\Z)')
                if isinstance(isbn, str):
                    m = p.match(isbn)
                    if m:
                        mi.isbn = m.group()
                else:
                    m = p.match(isbn[0])
                    if m:
                        mi.isbn = m.group()
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            lang = self.parse_language(root)
            if lang:
                mi.languages = lang
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        try:
            lccn = self.parse_lccn(root)
            if lccn:
                if isinstance(lccn, str):
                    mi.set_identifier('lccn', lccn)
                else:
                    for identifier in lccn:
                        mi.set_identifier('lccn', identifier)
        except:
            self.log.exception('Error parsing LCCN for url: %r' % self.url)

        try:
            ddc = self.parse_ddc(root)
            if ddc:
                if isinstance(ddc, str):
                    mi.set_identifier('ddc', ddc)
                else:
                    for identifier in ddc:
                        mi.set_identifier('ddc', identifier)
        except:
            self.log.exception('Error parsing DDC for url: %r' % self.url)

        try:
            lcc = self.parse_lcc(root)
            if lcc:
                if isinstance(lcc, str):
                    mi.set_identifier('lcc', lcc)
                else:
                    for identifier in lcc:
                        mi.set_identifier('lcc', identifier)
        except:
            self.log.exception('Error parsing LCC for url: %r' % self.url)

        mi.source_relevance = self.relevance

        self.result_queue.put(mi)
Beispiel #22
0
    def convert(self, oeb_book, output, input_plugin, opts, logger):
        """Convert from calibre's internal format to KePub."""
        common.log.debug("Running ePub conversion")
        self.epub_output_plugin.convert(
            oeb_book, output, input_plugin, opts, common.log
        )
        common.log.debug("Done ePub conversion")
        container = KEPubContainer(output, common.log, opts.kepub_clean_markup)

        if container.is_drm_encumbered:
            common.log.error("DRM-encumbered container, skipping conversion")
            return

        # Write the details file
        o = {
            "kepub_output_version": ".".join([str(n) for n in self.version]),
            "kepub_output_currenttime": datetime.utcnow().ctime(),
        }
        kte_data_file = self.temporary_file("_KePubOutputPluginInfo")
        kte_data_file.write(json.dumps(o).encode("UTF-8"))
        kte_data_file.close()
        container.copy_file_to_container(
            kte_data_file.name, name="plugininfo.kte", mt="application/json"
        )

        title = container.opf_xpath("./opf:metadata/dc:title/text()")
        if len(title) > 0:
            title = title[0]
        else:
            title = NULL_VALUES["title"]
        authors = container.opf_xpath(
            './opf:metadata/dc:creator[@opf:role="aut"]/text()'
        )
        if len(authors) < 1:
            authors = NULL_VALUES["authors"]
        mi = Metadata(title, authors)
        language = container.opf_xpath("./opf:metadata/dc:language/text()")
        if len(language) > 0:
            mi.languages = language
            language = language[0]
        else:
            mi.languages = NULL_VALUES["languages"]
            language = NULL_VALUES["language"]

        try:
            common.modify_epub(
                container,
                output,
                metadata=mi,
                opts={
                    "hyphenate": opts.kepub_hyphenate,
                    "hyphen_min_chars": opts.kepub_hyphenate_chars,
                    "hyphen_min_chars_before": opts.kepub_hyphenate_chars_before,
                    "hyphen_min_chars_after": opts.kepub_hyphenate_chars_after,
                    "hyphen_limit_lines": opts.kepub_hyphenate_limit_lines,
                    "no-hyphens": opts.kepub_disable_hyphenation,
                    "smarten_punctuation": False,
                    "extended_kepub_features": True,
                },
            )
        except Exception:
            common.log.exception("Failed converting!")
            raise
Beispiel #23
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root)
                             or first_simple('//xmp:CreateDate', root),
                             assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = first_simple('//xmp:MetadataDate', root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except:
            pass
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {
            'doi': check_doi,
            'isbn': check_isbn
    }.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #24
0
 def mi(self):
     mi = Metadata(unicode_type(self.title.text()).strip() or _('Unknown'))
     mi.authors = string_to_authors(unicode_type(self.authors.text()).strip()) or [_('Unknown')]
     mi.languages = self.languages.lang_codes or [get_lang()]
     return mi
Beispiel #25
0
    def __init__(self, parent, text, mi=None, fm=None, color_field=None,
                 icon_field_key=None, icon_rule_kind=None, doing_emblem=False,
                 text_is_placeholder=False, dialog_is_st_editor=False,
                 global_vars=None, all_functions=None, builtin_functions=None):
        QDialog.__init__(self, parent)
        Ui_TemplateDialog.__init__(self)
        self.setupUi(self)

        self.coloring = color_field is not None
        self.iconing = icon_field_key is not None
        self.embleming = doing_emblem
        self.dialog_is_st_editor = dialog_is_st_editor
        if global_vars is None:
            self.global_vars = {}
        else:
            self.global_vars = global_vars

        cols = []
        if fm is not None:
            for key in sorted(displayable_columns(fm),
                              key=lambda k: sort_key(fm[k]['name'] if k != color_row_key else 0)):
                if key == color_row_key and not self.coloring:
                    continue
                from calibre.gui2.preferences.coloring import all_columns_string
                name = all_columns_string if key == color_row_key else fm[key]['name']
                if name:
                    cols.append((name, key))

        self.color_layout.setVisible(False)
        self.icon_layout.setVisible(False)

        if self.coloring:
            self.color_layout.setVisible(True)
            for n1, k1 in cols:
                self.colored_field.addItem(n1 +
                       (' (' + k1 + ')' if k1 != color_row_key else ''), k1)
            self.colored_field.setCurrentIndex(self.colored_field.findData(color_field))
        elif self.iconing or self.embleming:
            self.icon_layout.setVisible(True)
            if self.embleming:
                self.icon_kind_label.setVisible(False)
                self.icon_kind.setVisible(False)
                self.icon_chooser_label.setVisible(False)
                self.icon_field.setVisible(False)

            for n1, k1 in cols:
                self.icon_field.addItem('{} ({})'.format(n1, k1), k1)
            self.icon_file_names = []
            d = os.path.join(config_dir, 'cc_icons')
            if os.path.exists(d):
                for icon_file in os.listdir(d):
                    icon_file = icu_lower(icon_file)
                    if os.path.exists(os.path.join(d, icon_file)):
                        if icon_file.endswith('.png'):
                            self.icon_file_names.append(icon_file)
            self.icon_file_names.sort(key=sort_key)
            self.update_filename_box()

            if self.iconing:
                dex = 0
                from calibre.gui2.preferences.coloring import icon_rule_kinds
                for i,tup in enumerate(icon_rule_kinds):
                    txt,val = tup
                    self.icon_kind.addItem(txt, userData=(val))
                    if val == icon_rule_kind:
                        dex = i
                self.icon_kind.setCurrentIndex(dex)
                self.icon_field.setCurrentIndex(self.icon_field.findData(icon_field_key))

        if dialog_is_st_editor:
            self.buttonBox.setVisible(False)
        else:
            self.new_doc_label.setVisible(False)
            self.new_doc.setVisible(False)
            self.template_name_label.setVisible(False)
            self.template_name.setVisible(False)

        if mi:
            if not isinstance(mi, list):
                mi = (mi, )
        else:
            mi = Metadata(_('Title'), [_('Author')])
            mi.author_sort = _('Author Sort')
            mi.series = ngettext('Series', 'Series', 1)
            mi.series_index = 3
            mi.rating = 4.0
            mi.tags = [_('Tag 1'), _('Tag 2')]
            mi.languages = ['eng']
            mi.id = 1
            if fm is not None:
                self.mi.set_all_user_metadata(fm.custom_field_metadata())
            else:
                # No field metadata. Grab a copy from the current library so
                # that we can validate any custom column names. The values for
                # the columns will all be empty, which in some very unusual
                # cases might cause formatter errors. We can live with that.
                from calibre.gui2.ui import get_gui
                mi.set_all_user_metadata(
                      get_gui().current_db.new_api.field_metadata.custom_field_metadata())
            for col in mi.get_all_user_metadata(False):
                mi.set(col, (col,), 0)
            mi = (mi, )
        self.mi = mi

        # Set up the display table
        self.table_column_widths = None
        try:
            self.table_column_widths = \
                        gprefs.get('template_editor_table_widths', None)
        except:
            pass
        tv = self.template_value
        tv.setRowCount(len(mi))
        tv.setColumnCount(2)
        tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
        tv.horizontalHeader().setStretchLastSection(True)
        tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
        # Set the height of the table
        h = tv.rowHeight(0) * min(len(mi), 5)
        h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
        tv.setMinimumHeight(h)
        tv.setMaximumHeight(h)
        # Set the size of the title column
        if self.table_column_widths:
            tv.setColumnWidth(0, self.table_column_widths[0])
        else:
            tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
        # Use our own widget to get rid of elision. setTextElideMode() doesn't work
        for r in range(0, len(mi)):
            w = QLineEdit(tv)
            w.setReadOnly(True)
            tv.setCellWidget(r, 0, w)
            w = QLineEdit(tv)
            w.setReadOnly(True)
            tv.setCellWidget(r, 1, w)

        # Remove help icon on title bar
        icon = self.windowIcon()
        self.setWindowFlags(self.windowFlags()&(~Qt.WindowType.WindowContextHelpButtonHint))
        self.setWindowIcon(icon)

        self.all_functions = all_functions if all_functions else formatter_functions().get_functions()
        self.builtins = (builtin_functions if builtin_functions else
                         formatter_functions().get_builtins_and_aliases())

        self.last_text = ''
        self.highlighter = TemplateHighlighter(self.textbox.document(), builtin_functions=self.builtins)
        self.textbox.cursorPositionChanged.connect(self.text_cursor_changed)
        self.textbox.textChanged.connect(self.textbox_changed)

        self.textbox.setTabStopWidth(10)
        self.source_code.setTabStopWidth(10)
        self.documentation.setReadOnly(True)
        self.source_code.setReadOnly(True)

        if text is not None:
            if text_is_placeholder:
                self.textbox.setPlaceholderText(text)
                self.textbox.clear()
                text = ''
            else:
                self.textbox.setPlainText(text)
        else:
            text = ''
        self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText(_('&OK'))
        self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText(_('&Cancel'))
        self.color_copy_button.clicked.connect(self.color_to_clipboard)
        self.filename_button.clicked.connect(self.filename_button_clicked)
        self.icon_copy_button.clicked.connect(self.icon_to_clipboard)

        try:
            with open(P('template-functions.json'), 'rb') as f:
                self.builtin_source_dict = json.load(f, encoding='utf-8')
        except:
            self.builtin_source_dict = {}

        func_names = sorted(self.all_functions)
        self.function.clear()
        self.function.addItem('')
        for f in func_names:
            self.function.addItem('{}  --  {}'.format(f,
                               self.function_type_string(f, longform=False)), f)
        self.function.setCurrentIndex(0)
        self.function.currentIndexChanged.connect(self.function_changed)
        self.display_values(text)
        self.rule = (None, '')

        tt = _('Template language tutorial')
        self.template_tutorial.setText(
            '<a href="%s">%s</a>' % (
                localize_user_manual_link('https://manual.calibre-ebook.com/template_lang.html'), tt))
        tt = _('Template function reference')
        self.template_func_reference.setText(
            '<a href="%s">%s</a>' % (
                localize_user_manual_link('https://manual.calibre-ebook.com/generated/en/template_ref.html'), tt))

        self.set_up_font_boxes()
        self.textbox.setFocus()
        # Now geometry
        try:
            geom = gprefs.get('template_editor_dialog_geometry', None)
            if geom is not None:
                QApplication.instance().safe_restore_geometry(self, QByteArray(geom))
        except Exception:
            pass
Beispiel #26
0
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []
        aus = {}
        aul = {}
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title = self._field_for('title',
                                   book_id,
                                   default_value=_('Unknown'))
        mi.authors = aum
        mi.author_sort = self._field_for('author_sort',
                                         book_id,
                                         default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments = self._field_for('comments', book_id)
        mi.publisher = self._field_for('publisher', book_id)
        n = nowf()
        mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
        mi.title_sort = self._field_for('sort',
                                        book_id,
                                        default_value=_('Unknown'))
        mi.book_size = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice',
                                          book_id,
                                          default_value='')
        mi.last_modified = self._field_for('last_modified',
                                           book_id,
                                           default_value=n)
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        mi.has_cover = _('Yes') if self._field_for(
            'cover', book_id, default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index',
                                              book_id,
                                              default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(
            self._field_for('identifiers', book_id, default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key + '_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name, cat, ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name, cat])
                    elif name == v:
                        res.append([name, cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Beispiel #27
0
	def parse_details(self, root):
		try:
			moly_id = self.parse_moly_id(self.url)
			self.log.info('Parsed moly.hu identifier: %s'%moly_id)
		except:
			self.log.exception('Error parsing moly.hu id for url: %r'%self.url)
			moly_id = None

		try:
			title = self.parse_title(root)
			self.log.info('Parsed title: %s'%title)
		except:
			self.log.exception('Error parsing title for url: %r'%self.url)
			title = None
		
		try:
			authors = self.parse_authors(root)
			self.log.info('Parsed authors: %s'%authors)
		except:
			self.log.exception('Error parsing authors for url: %r'%self.url)
			authors = []

		if not title or not authors or not moly_id:
			self.log.error('Could not find title/authors/moly.hu id for %r'%self.url)
			self.log.error('Moly.hu id: %r Title: %r Authors: %r'%(moly_id, title, authors))
			return

		mi = Metadata(title, authors)
		mi.set_identifier('moly_hu', moly_id)
		self.moly_id = moly_id

		try:
			isbn = self.parse_isbn(root)
			self.log.info('Parsed ISBN: %s'%isbn)
			if isbn:
				self.isbn = mi.isbn = isbn
		except:
			self.log.exception('Error parsing ISBN for url: %r'%self.url)
		
		try:
			series_info = self.parse_series(root)
			if series_info is not None:
				mi.series = series_info[0]
				mi.series_index = int(series_info[1])
				self.log.info('Parsed series: %s, series index: %f'%(mi.series,mi.series_index))
		except:
			self.log.exception('Error parsing series for url: %r'%self.url)
			
		try:
			mi.comments = self.parse_comments(root)
			self.log.info('Parsed comments: %s'%mi.comments)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_covers(root)
			self.log.info('Parsed URL for cover: %r'%self.cover_url)
			self.plugin.cache_identifier_to_cover_url(self.moly_id, self.cover_url)
			mi.has_cover = bool(self.cover_url)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)

		try:
			mi.tags = self.parse_tags(root)
			self.log.info('Parsed tags: %s'%mi.tags)
		except:
			self.log.exception('Error parsing tags for url: %r'%self.url)
			
		try:
			mi.languages = self.parse_languages(mi.tags)
			self.log.info('Parsed languages: %r'%mi.languages)
		except:
			self.log.exception('Error parsing language for url: %r'%self.url)
			
		try:
			mi.publisher = self.parse_publisher(root)
			self.log.info('Parsed publisher: %s'%mi.publisher)
		except:
			self.log.exception('Error parsing publisher for url: %r'%self.url)	
			
		try:
			mi.pubdate = self.parse_published_date(root)
			self.log.info('Parsed publication date: %s'%mi.pubdate)
		except:
			self.log.exception('Error parsing published date for url: %r'%self.url)
			
		try:
			mi.rating = self.parse_rating(root)
			self.log.info('Parsed rating: %s\n\n'%mi.rating)
		except:
			self.log.exception('Error parsing tags for url: %r\n\n'%self.url)


		mi.source_relevance = self.relevance

		if self.moly_id and self.isbn:
			self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id)

		self.plugin.clean_downloaded_metadata(mi)

		self.result_queue.put(mi)
Beispiel #28
0
    def parse_book_page(self, url):
        # TODO: Support for login-based rating fetching
        # TODO: Move all parsing logic to methods in order to avoid dangling variables
        # TODO: Saving metadata in custom columns
        # TODO: Configurable embedding metadata in comment
        # TODO: missing items
        # original language, first polish publish date, publisher serie, form

        self.log.info('INFO: Downloading book page: {}'.format(url))
        root_tag = self.get_lxml_root(url)

        if not root_tag:
            return None

        book_tag = self.get_book_tag(root_tag)

        if self.prefs['title']:
            book_title = self.parse_title(root_tag, book_tag, url)
        else:
            book_title = self.title

        if self.prefs['authors']:
            book_authors = self.parse_authors(root_tag, book_tag, url)
        else:
            book_authors = self.authors

        mi = Metadata(book_title, book_authors)
        additional_meta = {}

        if self.enabled('languages'):
            languages = self.parse_languages(root_tag, book_tag, url)
            if languages:
                mi.languages = languages

        if self.enabled('rating'):
            rating = self.parse_rating(root_tag, book_tag, url)
            if rating != None:
                mi.rating = rating

        if self.enabled('tags'):
            tags = self.parse_tags(root_tag, book_tag, url)
            if tags:
                mi.tags = tags

        if self.enabled('identifier'):
            identifier = self.parse_identifier(root_tag, book_tag, url)
            if identifier:
                mi.set_identifier(IDENTIFIER, identifier)

        if self.enabled('pubdate'):
            pubdate = self.parse_pubdate(root_tag, book_tag, url)
            if pubdate:
                mi.pubdate = pubdate

        if self.enabled('covers'):
            covers = self.parse_covers(root_tag, book_tag, url)
            if covers:
                mi.has_cover = True
                self.plugin.cached_identifier_to_cover_url('urls').extend(
                    covers)
            else:
                self.plugin.cache_identifier_to_cover_url('nocover', True)
                # TODO: is this necessary?

        if self.enabled('series'):
            series = self.parse_series(root_tag, book_tag, url)
            if series:
                additional_meta['series'] = [
                    self.get_series_string(name, index)
                    for name, index in series
                ]
                name, index = series[0]
                mi.series = name
                if index is not None:
                    mi.series_index = index

        if self.enabled('translators'):
            translators = self.parse_translators(root_tag, book_tag, url)
            if translators:
                additional_meta['translators'] = translators

        if self.enabled('original_title'):
            original_title = self.parse_original_title(root_tag, book_tag, url)
            if original_title:
                additional_meta['original_title'] = original_title

        if self.enabled('categories'):
            categories = self.parse_categories(root_tag, book_tag, url)
            if categories:
                additional_meta['categories'] = categories

        if self.enabled('genres'):
            genres = self.parse_genres(root_tag, book_tag, url)
            if genres:
                additional_meta['genres'] = genres

        if self.enabled('comments'):
            comments = self.parse_comments(root_tag, book_tag, url) or ''
            additional_comments = self.format_additional_comment(
                additional_meta)

            if comments or additional_comments:
                mi.comments = comments + additional_comments

        self.log.info('INFO: Parsing book page completed')

        return mi
Beispiel #29
0
    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):
        self.load_config()

        # get identifying tags from book
        idn = identifiers.get('dnb-idn', None)
        isbn = check_isbn(identifiers.get('isbn', None))

        # ignore unknown authors
        if authors is "V. A." or authors is "V.A." or authors is "Unknown" or authors is "Unbekannt":
            authors = None

        if (isbn is None) and (idn is None) and (title is None) and (authors is
                                                                     None):
            log.info(
                "This plugin requires at least either ISBN, IDN, Title or Author(s)."
            )
            return None

        queries = []
        # DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results
        exact_search = {}

        if idn is not None:
            queries.append('num=' + idn)
            exact_search['idn'] = idn

        else:
            authors_v = []
            title_v = []

            if authors is not None:
                authors_v.append(' '.join(authors))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=False)))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=True)))

            if title is not None:
                title_v.append(title)
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=False)))
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=True)))

            if isbn is not None:
                exact_search['isbn'] = isbn

            # title and author
            if authors is not None and title is not None:
                for a in authors_v:
                    for t in title_v:
                        if isbn is not None:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '" AND num="' + isbn + '"')
                        else:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '"')

            # title but no author
            elif authors is not None and title is None:
                for i in authors_v:
                    if isbn is not None:
                        queries.append('per="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('per="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('tit="' + authors[0] + '" AND num="' +
                                   isbn + '"')
                else:
                    queries.append('tit="' + authors[0] + '"')

            # author but no title
            elif authors is None and title is not None:
                for i in title_v:
                    if isbn is not None:
                        queries.append('tit="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('tit="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND num="' + isbn +
                                   '"')
                else:
                    queries.append('per="' + title + '"')

            # as last resort only use isbn
            if isbn is not None:
                queries.append('num=' + isbn)

            # Sort queries descending by length (assumption: longer query -> less but better results)
            #queries.sort(key=len)
            #queries.reverse()

        # remove duplicate queries
        uniqueQueries = []
        for i in queries:
            if i not in uniqueQueries:
                uniqueQueries.append(i)

        # Process queries
        results = None

        for query in uniqueQueries:
            query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
            log.info(query)

            if self.cfg_dnb_token is None:
                results = self.getSearchResultsByScraping(log, query, timeout)
            else:
                results = self.getSearchResults(log, query, timeout)

            if results is None:
                continue

            log.info("Parsing records")

            ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
            for record in results:
                series = None
                series_index = None
                publisher = None
                pubdate = None
                languages = []
                title = None
                title_sort = None
                edition = None
                comments = None
                idn = None
                urn = None
                isbn = None
                ddc = []
                subjects_gnd = []
                subjects_non_gnd = []

                # Title: Field 245
                title_parts = []
                # if a,n,p exist: series = a, series_index = n, title = p
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..",
                        namespaces=ns):
                    series_index = i.xpath(".//marc21:subfield[@code='n']",
                                           namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match:
                        series_index = match.group(1)
                    else:
                        series_index = "0"  # looks like sometimes DNB does not know the series index and uses something like "[...]"
                    series_index = series_index.replace(',', '.')
                    series = i.xpath(".//marc21:subfield[@code='a']",
                                     namespaces=ns)[0].text.strip()
                    title_parts.append(
                        i.xpath(".//marc21:subfield[@code='p']",
                                namespaces=ns)[0].text.strip())
                    log.info("Extracted Series: %s" % series)
                    log.info("Extracted Series Index: %s" % series_index)
                    break
                # otherwise: title = a
                if len(title_parts) == 0:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):
                        title_parts.append(i.text.strip())
                        break

                # subtitle 1
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns):
                    title_parts.append(i.text.strip())
                    break

                # subtitle 2
                #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
                #    title = title + " / " + i.text.strip()
                #    break

                title = " : ".join(title_parts)
                log.info("Extracted Title: %s" % title)

                # Title_Sort
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match(
                    '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$',
                    title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(
                        filter(None, [
                            title_sort_regex.group(1).strip(),
                            title_sort_regex.group(3).strip(), ", " + sortword
                        ]))
                title_sort = " : ".join(title_sort_parts)
                log.info("Extracted Title_Sort: %s" % title_sort)

                # Authors
                authors = []
                author_sort = None
                for i in record.xpath(
                        ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # primary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # secondary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                if len(
                        authors
                ) == 0:  # if no "real" autor was found take all persons involved
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):  # secondary authors
                        name = re.sub(" \[.*\]$", "", i.text.strip())
                        authors.append(name)
                if len(authors) > 0:
                    author_sort = authors[0]
                log.info("Extracted Authors: %s" % " & ".join(authors))

                # Comments
                for i in record.xpath(
                        ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",
                        namespaces=ns):
                    if i.text.startswith("http://deposit.dnb.de/"):
                        br = self.browser
                        log.info('Downloading Comments from: %s' % i.text)
                        try:
                            comments = br.open_novisit(i.text,
                                                       timeout=30).read()
                            comments = sanitize_comments_html(comments)
                            log.info('Comments: %s' % comments)
                            break
                        except:
                            log.info("Could not download Comments from %s" % i)

                # Publisher Name and Location
                publisher_name = None
                publisher_location = None
                fields = record.xpath(
                    ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                    namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(
                        ".//marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                    publisher_location = fields[0].xpath(
                        ".//marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",
                        namespaces=ns)
                    if len(fields) > 0:
                        publisher_name = fields[0].xpath(
                            ".//marc21:subfield[@code='b' and string-length(text())>0]",
                            namespaces=ns)[0].text.strip()
                    else:
                        fields = record.xpath(
                            ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",
                            namespaces=ns)
                        if len(fields) > 0:
                            publisher_location = fields[0].xpath(
                                ".//marc21:subfield[@code='a' and string-length(text())>0]",
                                namespaces=ns)[0].text.strip()

                log.info("Extracted Publisher: %s" % publisher_name)
                log.info("Extracted Publisher Location: %s" %
                         publisher_location)

                # Publishing Date
                for i in record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",
                        namespaces=ns):
                    match = re.search("(\d{4})", i.text.strip())
                    if match is not None:
                        year = match.group(1)
                        pubdate = datetime.datetime(int(year), 1, 2)
                        break
                log.info("Extracted Publication Year: %s" % pubdate)

                # ID: IDN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    idn = i.text.strip()
                    break
                log.info("Extracted ID IDN: %s" % idn)
                if "idn" in exact_search:
                    if idn != exact_search["idn"]:
                        log.info(
                            "Extracted IDN does not match book's IDN, skipping record"
                        )
                        continue

                # ID: URN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    urn = i.text.strip()
                    break
                log.info("Extracted ID URN: %s" % urn)

                # ID: ISBN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                    match = re.search(isbn_regex, i.text.strip())
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                    break
                log.info("Extracted ID ISBN: %s" % isbn)
                if "isbn" in exact_search:
                    if isbn != exact_search["isbn"]:
                        log.info(
                            "Extracted ISBN does not match book's ISBN, skipping record"
                        )
                        continue

                # ID: Sachgruppe (DDC)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    ddc.append(i.text.strip())
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

                # Series and Series_Index
                if series is None and series_index is None:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                            namespaces=ns):
                        # Series Index
                        series_index = i.xpath(".//marc21:subfield[@code='v']",
                                               namespaces=ns)[0].text.strip()
                        match = re.search("(\d+[,\.\d+]?)", series_index)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"
                        series_index = series_index.replace(',', '.')
                        log.info("Extracted Series Index: %s" % series_index)
                        # Series
                        series = i.xpath(".//marc21:subfield[@code='a']",
                                         namespaces=ns)[0].text.strip()
                        log.info("Extracted Series: %s" % series)
                        break

                # Try to extract Series, Series Index and Title from the fetched title.
                # Caution: This overwrites DNB's series/series_index and modifies the title!
                if self.cfg_guess_series is True:
                    guessed_series = None
                    guessed_series_index = None
                    parts = re.split("[:]",
                                     self.removeSortingCharacters(title))
                    if len(parts) == 2:
                        if bool(re.search("\d", parts[0])) != bool(
                                re.search("\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search("\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]

                            match = re.match(
                                "^[\s\-–:]*(.+?)[\s\-–:]*$", textpart
                            )  # remove odd characters from start and end of the text part
                            if match:
                                textpart = match.group(1)

                            # from Titleparts like: "Name of the series - Episode 2"	OK
                            match = re.match(
                                "^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                indexpart)
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                            else:
                                # from Titleparts like: "Episode 2 Name of the series"
                                match = re.match(
                                    "^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$",
                                    indexpart)
                                if match:
                                    guessed_series_index = match.group(1)
                                    guessed_series = match.group(2)
                                    if guessed_series is None:
                                        guessed_series = textpart
                                        title = textpart + " : Band " + guessed_series_index
                                    else:
                                        title = textpart
                    elif len(parts) == 1:
                        # from Titles like: "Name of the series - Title (Episode 2)"
                        match = re.match(
                            "^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                            parts[0])
                        if match:
                            guessed_series_index = match.group(3)
                            guessed_series = match.group(1)
                            title = match.group(2)

                        else:
                            # from Titles like: "Name of the series - Episode 2"
                            match = re.match(
                                "^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                parts[0])
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                title = guessed_series + " : Band " + guessed_series_index

                    if guessed_series is not None and guessed_series_index is not None:
                        series = guessed_series
                        series_index = guessed_series_index
                        log.info("Guessed Series: %s" % series)
                        log.info("Guessed Series Index: %s" % series_index)

                # GND Subjects from 689
                for i in record.xpath(
                        ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    subjects_gnd.append(i.text.strip())
                # GND Subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        if i.text.startswith("("):
                            continue
                        subjects_gnd.append(i.text)
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

                # Non-GND subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        # ignore entries starting with "(":
                        if i.text.startswith("("):
                            continue
                        subjects_non_gnd.extend(re.split(',|;', i.text))
                # remove one-character subjects:
                for i in subjects_non_gnd:
                    if len(i) < 2:
                        subjects_non_gnd.remove(i)
                log.info("Extracted non-GND Subjects: %s" %
                         " ".join(subjects_non_gnd))

                # Edition
                for i in record.xpath(
                        ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    edition = i.text.strip()
                    break
                log.info("Extracted Edition: %s" % edition)

                # Languages
                for i in record.xpath(
                        ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    languages.append(i.text.strip())
                if languages is not None:
                    log.info("Extracted Languages: %s" % ",".join(languages))

                # Put it all together
                if self.cfg_append_edition_to_title == True and edition is not None:
                    title = title + " : " + edition

                mi = Metadata(
                    self.removeSortingCharacters(title),
                    map(lambda i: self.removeSortingCharacters(i), authors))
                mi.title_sort = self.removeSortingCharacters(title_sort)
                mi.author_sort = self.removeSortingCharacters(author_sort)
                mi.languages = languages
                mi.pubdate = pubdate
                mi.publisher = " : ".join(
                    filter(None, [
                        publisher_location,
                        self.removeSortingCharacters(publisher_name)
                    ]))
                mi.series = self.removeSortingCharacters(series)
                mi.series_index = series_index
                mi.comments = comments
                mi.isbn = isbn  # also required for cover download
                mi.set_identifier('urn', urn)
                mi.set_identifier('dnb-idn', idn)
                mi.set_identifier('ddc', ",".join(ddc))

                if self.cfg_fetch_subjects == 0:
                    mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 1:
                    if len(subjects_gnd) > 0:
                        mi.tags = self.uniq(subjects_gnd)
                    else:
                        mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 2:
                    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
                elif self.cfg_fetch_subjects == 3:
                    if len(subjects_non_gnd) > 0:
                        mi.tags = self.uniq(subjects_non_gnd)
                    else:
                        mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 4:
                    mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 5:
                    mi.tags = []

                # put current result's metdata into result queue
                log.info("Final formatted result: %s" % mi)
                result_queue.put(mi)
 def set_mi(self, mi, fm):
     '''
     This sets the metadata for the test result books table. It doesn't reset
     the contents of the field selectors for editing rules.
     '''
     self.fm = fm
     if mi:
         if not isinstance(mi, list):
             mi = (mi, )
     else:
         mi = Metadata(_('Title'), [_('Author')])
         mi.author_sort = _('Author Sort')
         mi.series = ngettext('Series', 'Series', 1)
         mi.series_index = 3
         mi.rating = 4.0
         mi.tags = [_('Tag 1'), _('Tag 2')]
         mi.languages = ['eng']
         mi.id = 1
         if self.fm is not None:
             mi.set_all_user_metadata(self.fm.custom_field_metadata())
         else:
             # No field metadata. Grab a copy from the current library so
             # that we can validate any custom column names. The values for
             # the columns will all be empty, which in some very unusual
             # cases might cause formatter errors. We can live with that.
             from calibre.gui2.ui import get_gui
             fm = get_gui().current_db.new_api.field_metadata
             mi.set_all_user_metadata(fm.custom_field_metadata())
         for col in mi.get_all_user_metadata(False):
             if fm[col]['datatype'] == 'datetime':
                 mi.set(col, DEFAULT_DATE)
             elif fm[col]['datatype'] in ('int', 'float', 'rating'):
                 mi.set(col, 2)
             elif fm[col]['datatype'] == 'bool':
                 mi.set(col, False)
             elif fm[col]['is_multiple']:
                 mi.set(col, (col, ))
             else:
                 mi.set(col, col, 1)
         mi = (mi, )
     self.mi = mi
     tv = self.template_value
     tv.setColumnCount(2)
     tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
     tv.horizontalHeader().setStretchLastSection(True)
     tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
     tv.setRowCount(len(mi))
     # Set the height of the table
     h = tv.rowHeight(0) * min(len(mi), 5)
     h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
     tv.setMinimumHeight(h)
     tv.setMaximumHeight(h)
     # Set the size of the title column
     if self.table_column_widths:
         tv.setColumnWidth(0, self.table_column_widths[0])
     else:
         tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
     tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
     tv.setRowCount(len(mi))
     # Use our own widget to get rid of elision. setTextElideMode() doesn't work
     for r in range(0, len(mi)):
         w = QLineEdit(tv)
         w.setReadOnly(True)
         tv.setCellWidget(r, 0, w)
         w = QLineEdit(tv)
         w.setReadOnly(True)
         tv.setCellWidget(r, 1, w)
     self.display_values('')
Beispiel #31
0
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []
        aus = {}
        aul = {}
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title       = self._field_for('title', book_id,
                default_value=_('Unknown'))
        mi.authors     = aum
        mi.author_sort = self._field_for('author_sort', book_id,
                default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments    = self._field_for('comments', book_id)
        mi.publisher   = self._field_for('publisher', book_id)
        n = nowf()
        mi.timestamp   = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate     = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid        = self._field_for('uuid', book_id,
                default_value='dummy')
        mi.title_sort  = self._field_for('sort', book_id,
                default_value=_('Unknown'))
        mi.book_size   = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
        mi.last_modified = self._field_for('last_modified', book_id,
                default_value=n)
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        mi.has_cover = _('Yes') if self._field_for('cover', book_id,
                default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index', book_id,
                    default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(self._field_for('identifiers', book_id,
            default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key+'_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name,cat,ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name,cat])
                    elif name == v:
                        res.append([name,cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Beispiel #32
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_("Unknown"))
    title = first_alt("//dc:title", root)
    if title:
        mi.title = title
    authors = multiple_sequences("//dc:creator", root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences("//dc:subject", root) or multiple_sequences("//pdf:Keywords", root)
    if tags:
        mi.tags = tags
    comments = first_alt("//dc:description", root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences("//dc:publisher", root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(
            first_sequence("//dc:date", root) or first_simple("//xmp:CreateDate", root), assume_utc=False
        )
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple("//xmp:CreatorTool", root)
    if bkp:
        mi.book_producer = bkp
    md = first_simple("//xmp:MetadataDate", root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except:
            pass
    rating = first_simple("//calibre:rating", root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ("title_sort", "author_sort"):
        for elem in XPath("//calibre:" + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ("author_link_map", "user_categories"):
        val = first_simple("//calibre:" + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences("//dc:language", root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath("//xmp:Identifier")(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ("prism", "pdfx"):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple("//%s:%s" % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == "isbn":
                    val = check_isbn(val)
                elif scheme == "doi":
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {"doi": check_doi, "isbn": check_isbn}.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple("//dc:identifier", root))
            if val:
                identifiers["doi"] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #33
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError(
                'Corrupted XMP metadata packet detected, probably generated by Nitro PDF'
            )
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = [au for aus in authors for au in string_to_authors(aus)]
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root)
                             or first_simple('//xmp:CreateDate', root),
                             assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in iteritems({
            'doi': check_doi,
            'isbn': check_isbn
    }):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #34
0
    def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
	self.load_config()

	if authors is None:
	    authors=[]

	# get identifying tags from book
	idn = identifiers.get('dnb-idn', None)
	isbn = check_isbn(identifiers.get('isbn', None))

	# ignore unknown authors
	ignored_authors = [ "V. A.", "V.A.", "Unknown", "Unbekannt" ]
	for i in ignored_authors:
	    authors = [ x for x in authors if x != i ]

	if (isbn is None) and (idn is None) and (title is None) and (authors is None):
	    log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
	    return None


	queries=[]
	# DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results
	exact_search = {}

	if idn is not None:
	    exact_search['idn'] = idn
	    # in case look for a IDN only search for the IDN and skip all the other stuff
	    queries.append('num='+idn)

	else:
	    authors_v = []
	    title_v = []

	    # create some variants of given authors
	    if authors != []:
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=False)))	# concat all author names ("Peter Meier Luise Stark")
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=True)))	# use only first author
		for a in authors:
		    authors_v.append(a)	# use all authors, one by one

		# remove duplicates
		unique_authors_v = []
		for i in authors_v:
		    if i not in unique_authors_v:
			unique_authors_v.append(i)


	    # create some variants of given title
	    if title is not None:
		title_v.append(title)	# simply use given title
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=False)))	# remove some punctation characters
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=True)))	# remove subtitle (everything after " : ")
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=False)))	# remove some punctation characters and joiners ("and", "&", ...)
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)))	# remove subtitle (everything after " : ") and joiners ("and", "&", ...)
		# TODO: remove subtitle after " - "

		# remove duplicates
		unique_title_v = []
		for i in title_v:
		    if i not in unique_title_v:
			unique_title_v.append(i)


	    # title and author
	    if authors_v != [] and title_v != []:
		for a in authors_v:
		    for t in title_v:
			if isbn is not None:
			    queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
			else:
			    queries.append('tit="' + t + '" AND per="' + a + '"')

		# try with first author as title and title (without subtitle) as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')

		# try with author and title (without subtitle) in any index
		if isbn is not None:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # author but no title
	    elif authors_v != [] and title_v == []:
		for i in authors_v:
		    if isbn is not None:
			queries.append('per="'+ i +'" AND num="' + isbn + '"')
		    else:
			queries.append('per="'+ i +'"')

		# try with author as title
		if isbn is not None:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # title but no author
	    elif authors_v == [] and title_v != []:
		for i in title_v:
		    if isbn is not None:
			queries.append('tit="' + i + '" AND num="' + isbn + '"')
		    else:
			queries.append('tit="' + i + '"')

		# try with title as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '"')


	    # as last resort only use isbn
	    if isbn is not None:
		queries.append('num=' + isbn)


	# remove duplicate queries
	uniqueQueries=[]
	for i in queries:
	    if i not in uniqueQueries:
		uniqueQueries.append(i)


	# Process queries
	results = None

	for query in uniqueQueries:
	    # SRU does not work with "+" or "?" characters in query, so we simply remove them
	    query = re.sub('[\+\?]','',query)

	    query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
	    log.info(query)

	    if self.cfg_dnb_token is None:
		results = self.getSearchResultsByScraping(log, query, timeout)
	    else:
		results = self.getSearchResults(log, query, timeout)

	    if results is None:
		continue

	    log.info("Parsing records")

	    ns = { 'marc21' : 'http://www.loc.gov/MARC21/slim' }
	    for record in results:
		series = None
		series_index = None
		publisher = None
		pubdate = None
		languages = []
		title = None
		title_sort = None
		authors = []
		author_sort = None
		edition = None
		comments = None
		idn = None
		urn = None
		isbn = None
		ddc = []
		subjects_gnd = []
		subjects_non_gnd = []
		publisher_name = None
		publisher_location = None


		##### Field 264 #####
		# Publisher Name and Location
		fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns)
		if len(fields)>0:
		    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		else:
		    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",namespaces=ns)
		    if len(fields)>0:
			publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    else:
			fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",namespaces=ns)
			if len(fields)>0:
			    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();

		# Publishing Date
		for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",namespaces=ns):
		    match = re.search("(\d{4})", i.text.strip())
		    if match is not None:
			year = match.group(1)
			pubdate = datetime.datetime(int(year), 1, 1, 12 , 30, 0)
			break

		# Log
		if publisher_name is not None:
		    log.info("Extracted Publisher: %s" % publisher_name)
		if publisher_location is not None:
		    log.info("Extracted Publisher Location: %s" % publisher_location)
		if pubdate is not None:
		    log.info("Extracted Publication Year: %s" % pubdate)


		##### Field 245 ####
		# Title/Series/Series_Index
		title_parts = []
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
		    # if a,n,p,n,p,n,p exist:		series = a + n0 + " - " + p0 + n1 + " - " + p1,	series_index = n2,	title = p2
		    # if a,n,p,n,p exist:		series = a + n0 + " - " + p0, 			series_index = n1,	title = p1	(Example: dnb-id 1008774839)
		    # if a,n,p exist:			series = a,					series_index = n,	title = p
		    # if a exist:												title = a
		    # TODO: a,n,p,n (i.e. 956375146)

		    code_p = []
		    code_n = []
		    code_a = []

		    for j in i.xpath(".//marc21:subfield[@code='p']",namespaces=ns):
			code_p.append(j.text.strip())

		    for j in i.xpath(".//marc21:subfield[@code='n']",namespaces=ns):
			match = re.search("(\d+[,\.\d+]?)", j.text.strip())
			if match:
			    code_n.append(match.group(1))
			else:
			    code_n.append("0")	# looks like sometimes DNB does not know the series index and uses something like "[...]"

		    for j in i.xpath(".//marc21:subfield[@code='a']",namespaces=ns):
			code_a.append(j.text.strip())

		    if len(code_p) == 0:
			title_parts = title_parts + code_a

		    elif len(code_p)>0 and len(code_p) == len(code_n):
			series = " : ".join(code_a)	# I've never seen more than one code_a, but who knows...
			for i in range (0,len(code_p)-1):
			    series = series + " " + code_n[i] + " " + code_p[i]
			series_index = code_n[-1]
			title_parts.append(code_p[-1])


		# subtitle 1: Field 245
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns):
		    title_parts.append(i.text.strip())
		    break
		
		# subtitle 2
		#for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
		#    title = title + " / " + i.text.strip()
		#    break

		title = " : ".join(title_parts)

		# Log
		if series_index is not None:
		    log.info("Extracted Series_Index from Field 245: %s" % series_index)
		if series is not None:
		    log.info("Extracted Series from Field 245: %s" % series)
		    series = self.cleanUpSeries(log, series, publisher_name)
		if title is not None:
		    log.info("Extracted Title: %s" % title)
		    title = self.cleanUpTitle(log, title)

		# Title_Sort
		if len(title_parts)>0:
		    title_sort_parts = list(title_parts)
		    title_sort_regex = re.match('^(.*?)('+chr(152)+'.*'+chr(156)+')?(.*?)$',title_parts[0])
		    sortword = title_sort_regex.group(2)
		    if sortword:
			title_sort_parts[0] = ''.join(filter(None,[title_sort_regex.group(1).strip(),title_sort_regex.group(3).strip(),", "+sortword]))
		    title_sort = " : ".join(title_sort_parts)

		# Log
		if title_sort is not None:
		    log.info("Extracted Title_Sort: %s" % title_sort)


		##### Field 100 and Field 700 #####
		# Authors
		for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# primary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		if len(authors)==0:	# if no "real" autor was found take all persons involved
		    for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
			name = re.sub(" \[.*\]$","",i.text.strip());
			authors.append(name)
		if len(authors)>0:
		    author_sort = authors[0]

		# Log
		if len(authors)>0:
		    log.info("Extracted Authors: %s" % " & ".join(authors))
		if author_sort is not None:
		    log.info("Extracted Author_Sort: %s" % " & ".join(authors))


		##### Field 856 #####
		# Comments
		for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
		    if i.text.startswith("http://deposit.dnb.de/"):
			br = self.browser
			log.info('Downloading Comments from: %s' % i.text)
			try:
			    comments = br.open_novisit(i.text, timeout=30).read()
			    comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
			    comments = sanitize_comments_html(comments)
			    break
			except:
			    log.info("Could not download Comments from %s" % i)

		# Log
		if comments is not None:
		    log.info('Comments: %s' % comments)

		# If no comments are found for this edition, look at other editions of this book (Fields 776)
		# TODO: Make this configurable (default: yes)
		if comments is None:
		    # get all other issues
		    for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]",namespaces=ns):
			other_idn = re.sub("^\(.*\)","",i.text.strip());
			subquery = 'num='+other_idn+' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
			log.info(subquery)

			if self.cfg_dnb_token is None:
			    subresults = self.getSearchResultsByScraping(log, subquery, timeout)
			else:
			    subresults = self.getSearchResults(log, subquery, timeout)

			if subresults is None:
			    continue

			for subrecord in subresults:
			    for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
				if i.text.startswith("http://deposit.dnb.de/"):
				    br = self.browser
				    log.info('Downloading Comments from: %s' % i.text)
				    try:
					comments = br.open_novisit(i.text, timeout=30).read()
					comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
					comments = sanitize_comments_html(comments)
					break
				    except:
					log.info("Could not download Comments from %s" % i)
			    if comments is not None:
				log.info('Comments from other issue: %s' % comments)
				break


		##### Field 16 #####
		# ID: IDN
		for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    idn = i.text.strip()
		    break
		# Log
		if idn is not None:
		    log.info("Extracted ID IDN: %s" % idn)


		##### Field 24 #####
		# ID: URN
		for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    urn = i.text.strip()
		    break

		# Log
		if urn is not None:
		    log.info("Extracted ID URN: %s" % urn)


		##### Field 20 #####
		# ID: ISBN
		for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
		    match = re.search(isbn_regex, i.text.strip())
		    isbn = match.group()
		    isbn = isbn.replace('-','')
		    break

		# Log
		if isbn is not None:
		    log.info("Extracted ID ISBN: %s" % isbn)

		# When doing an exact search for a given ISBN skip books with wrong ISBNs
		if isbn is not None and "isbn" in exact_search:
		    if isbn != exact_search["isbn"]:
			log.info("Extracted ISBN does not match book's ISBN, skipping record")
			continue


		##### Field 82 #####
		# ID: Sachgruppe (DDC)
		for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    ddc.append(i.text.strip())

		# Log
		if len(ddc)>0:
		    log.info("Extracted ID DDC: %s" % ",".join(ddc))


		##### Field 490 #####
		# In theory this field is not used for "real" book series, use field 830 instead. But it is used.
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# "v" either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a
			attr_v = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			parts = re.split(" : ",attr_v)
			if len(parts)==2:
			    if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
				# figure out which part contains the index
				if bool(re.search("\d",parts[0])):
				    indexpart = parts[0]
				    textpart = parts[1]
				else:
				    indexpart = parts[1]
				    textpart = parts[0]

				match = re.search("(\d+[,\.\d+]?)", indexpart)
				if match is not None:
				    series_index = match.group(1)
				    series = textpart.strip()

			else:
			    match = re.search("(\d+[,\.\d+]?)", attr_v)
			    if match is not None:
				series_index = match.group(1)
			    else:
				series_index = "0"

			series_index = series_index.replace(',','.')

			# Use Series Name from attribute "a" if not already found in attribute "v"
			if series is None:
			    series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 490: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 490: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 246 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
			if match is not None:
			    series = match.group(1)
			    series_index = match.group(2)

			    # Log
			    if series_index is not None:
				log.info("Extracted Series Index from Field 246: %s" % series_index)
			    if series is not None:
				log.info("Extracted Series from Field 246: %s" % series)
				series = self.cleanUpSeries(log, series, publisher_name)
			    if series is not None:
				break

		##### Field 800 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='t']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 800: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 800: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 830 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 830: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 830: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 689 #####
		# GND Subjects
		for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    subjects_gnd.append(i.text.strip())
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			if i.text.startswith("("):
			    continue
			subjects_gnd.append(i.text)

		# Log
		if len(subjects_gnd)>0:
		    log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))


		##### Fields 600-655 #####
		# Non-GND subjects
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			# ignore entries starting with "(":
			if i.text.startswith("("):
			    continue
			subjects_non_gnd.extend(re.split(',|;',i.text))
		# remove one-character subjects:
		for i in subjects_non_gnd:
		    if len(i)<2:
			subjects_non_gnd.remove(i)

		# Log
		if len(subjects_non_gnd)>0:
		    log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))


		##### Field 250 #####
		# Edition
		for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    edition = i.text.strip()
		    break

		# Log
		if edition is not None:
		    log.info("Extracted Edition: %s" % edition)


		##### Field 41 #####
		# Languages
		for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    languages.append(i.text.strip())

		# Log
		if languages is not None:
		    log.info("Extracted Languages: %s" % ",".join(languages))


		##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
		#if self.cfg_guess_series is True:
		if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
		    guessed_series = None
		    guessed_series_index = None
		    guessed_title = None

		    log.info("Starting Series Guesser")

		    parts = re.split("[:]",self.removeSortingCharacters(title))

		    if len(parts)==2:
			log.info("Title has two parts")
			# make sure only one part of the two parts contains digits
			if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
			    log.info("only one title part contains digits")
			    # figure out which part contains the index
			    if bool(re.search("\d",parts[0])):
				indexpart = parts[0]
				textpart = parts[1]
			    else:
				indexpart = parts[1]
				textpart = parts[0]

			    # Look at the part without digits:
			    match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$",textpart)	# remove odd characters from start and end of the text part
			    if match:
				textpart = match.group(1)

			    # Look at the part with digits:
			    # for Titleparts like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",indexpart)
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				if guessed_series is None:
				    guessed_series = textpart
				    guessed_title = textpart + " : Band " + guessed_series_index
				else:
				    guessed_title = textpart

				#log.info("ALGO1: guessed_title: " + guessed_title)
				#log.info("ALGO1: guessed_series: " + guessed_series)
				#log.info("ALGO1: guessed_series_index: " + guessed_series_index)

			    else:
				# for Titleparts like: "Episode 2 Name of the series"
				match = re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$",indexpart)
				if match:
				    guessed_series_index = match.group(1)
				    guessed_series = match.group(2)

				    if guessed_series is None:
					# sometimes books with multiple volumes are detected as series without name -> Add the volume to the title 
					guessed_series = textpart
					guessed_title = textpart + " : Band " + guessed_series_index
				    else:
					guessed_title = textpart

				    #log.info("ALGO2: guessed_title: " + guessed_title)
				    #log.info("ALGO2: guessed_series: " + guessed_series)
				    #log.info("ALGO2: guessed_series_index: " + guessed_series_index)

				else:
				    # for titleparts like: "Band 2"
				    match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$",indexpart)
				    if match:
					guessed_series_index = match.group(1)
					# ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE
					# some false positives
					match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$",textpart)
					if match:
					    guessed_series = match.group(1)
					    guessed_title = match.group(2)

					    log.info("ALGO3: guessed_title: " + guessed_title)
					    log.info("ALGO3: guessed_series: " + guessed_series)
					    log.info("ALGO3: guessed_series_index: " + guessed_series_index)


		    elif len(parts)==1:
			log.info("Title has one part")
			# for Titles like: "Name of the series - Title (Episode 2)"
			match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			if match:
			    guessed_series_index = match.group(3)
			    guessed_series = match.group(1)
			    guessed_title = match.group(2)

			    #log.info("ALGO4: guessed_title: " + guessed_title)
			    #log.info("ALGO4: guessed_series: " + guessed_series)
			    #log.info("ALGO4: guessed_series_index: " + guessed_series_index)

			else:
			    # for Titles like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				guessed_title = guessed_series + " : Band " + guessed_series_index

				#log.info("ALGO5: guessed_title: " + guessed_title)
				#log.info("ALGO5: guessed_series: " + guessed_series)
				#log.info("ALGO5: guessed_series_index: " + guessed_series_index)

		    # Log
		    if guessed_series is not None:
			log.info("Guessed Series: %s" % guessed_series)
			#guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name)
		    if guessed_series_index is not None:
			log.info("Guessed Series Index: %s" % guessed_series_index)
		    if guessed_title is not None:
			log.info("Guessed Title: %s" % guessed_title)
			guessed_title = self.cleanUpTitle(log, guessed_title)

		    if guessed_series is not None and guessed_series_index is not None and guessed_title is not None:
			title = guessed_title
			series = guessed_series
			series_index = guessed_series_index


		##### Filter exact searches #####
		# When doing an exact search for a given IDN skip books with wrong IDNs
		# TODO: Currently exact_search for ISBN is not implemented. Would require ISBN-10 and ISBN-13 conversions
		if idn is not None and "idn" in exact_search:
		    if idn != exact_search["idn"]:
			log.info("Extracted IDN does not match book's IDN, skipping record")
			continue

		##### Put it all together #####
		if self.cfg_append_edition_to_title == True and edition is not None:
		    title = title + " : " + edition

		mi = Metadata(self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors))
		mi.title_sort = self.removeSortingCharacters(title_sort)
		mi.author_sort = self.removeSortingCharacters(author_sort)
		mi.languages = languages
		mi.pubdate = pubdate
		mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)]))
		mi.series = self.removeSortingCharacters(series)
		mi.series_index = series_index
		mi.comments = comments
		mi.isbn = isbn # also required for cover download
		mi.set_identifier('urn',urn)
		mi.set_identifier('dnb-idn',idn)
		mi.set_identifier('ddc', ",".join(ddc))

		# cfg_subjects:
		# 0: use only subjects_gnd
		if self.cfg_fetch_subjects == 0:
		    mi.tags = self.uniq(subjects_gnd)
		# 1: use only subjects_gnd if found, else subjects_non_gnd
		elif self.cfg_fetch_subjects == 1:
		    if len(subjects_gnd)>0:
			mi.tags = self.uniq(subjects_gnd)
		    else:
			mi.tags = self.uniq(subjects_non_gnd)
		# 2: subjects_gnd and subjects_non_gnd
		elif self.cfg_fetch_subjects == 2:
		    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
		# 3: use only subjects_non_gnd if found, else subjects_gnd
		elif self.cfg_fetch_subjects == 3:
		    if len(subjects_non_gnd)>0:
			mi.tags = self.uniq(subjects_non_gnd)
		    else:
			mi.tags = self.uniq(subjects_gnd)
		# 4: use only subjects_non_gnd
		elif self.cfg_fetch_subjects == 4:
		    mi.tags = self.uniq(subjects_non_gnd)
		# 5: use no subjects at all
		elif self.cfg_fetch_subjects == 5:
		    mi.tags = []

		# put current result's metdata into result queue
		log.info("Final formatted result: \n%s" % mi)
		result_queue.put(mi)
Beispiel #35
0
    def parse_details(self, raw, root):
        #解析元数据各字段数据
        #self.log.info("=====")
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r' % self.url)
            asin = None
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(
                    prefix=(asin or str(uuid.uuid4())) + '_',
                    suffix='.html',
                    delete=False) as f:
                f.write(raw)
            print('Downloaded html for', asin, 'saved in', f.name)
        # 分析取得书名
        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None
        #分析取得作者
        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r' %
                           self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' %
                           (asin, title, authors))
            return
        #以书名,作者为元数据对象mi,用于设置元数据
        mi = Metadata(title, authors)
        #设置Bookid
        idtype = '17k'
        mi.set_identifier(idtype, asin)
        self.k17k_id = asin

        #设备注释(简介)
        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
        #设置丛书系列
        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
        #设置标签
        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        #设置最后更新日期
#        try:
#            mi.last_modified = self.parse_last_modified(root)
#        except:
#            self.log.exception('Error parsing last_modified for url: %r'%self.url)
#设置封面
        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        mi.has_cover = bool(self.cover_url)
        mi.source_relevance = self.relevance
        mi.languages = [
            u'中文',
        ]

        if self.k17k_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.k17k_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #36
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title.startswith(r'\376\377'):
        # corrupted XMP packet generated by Nitro PDF. See
        # https://bugs.launchpad.net/calibre/+bug/1541981
        raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:'+x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #37
0
def read_metadata_kfx(stream, read_cover=True):
    ' Read the metadata.kfx file that is found in the sdr book folder for KFX files '
    c = Container(stream.read())
    m = extract_metadata(c.decode())

    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ''
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get('title') or _('Unknown')
    authors = get('author', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(x):
        if tweaks['author_sort_copy_method'] != 'copy':
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + ' ' + m.group(1)
        return x

    unique_authors = []  # remove duplicates while retaining order
    for f in [fix_author(x) for x in authors]:
        if f not in unique_authors:
            unique_authors.append(f)

    mi = Metadata(title, unique_authors)
    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = list(
            filter(None,
                   (canonicalize_lang(x) for x in get('languages', False))))
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            fmt, w, h = identify(bytes(data))
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)

    return mi
Beispiel #38
0
    def parse_details(self, root):
        try:
            antik_id = self.parse_antik_id(root)
            self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
        except:
            self.log.exception('Error parsing Antikvarium id for url: %r' %
                               self.url)
            antik_id = None

        try:
            title = self.parse_title(root)
            self.log.info('Parsed title: %s' % title)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
            self.log.info('Parsed authors: %s' % authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not antik_id:
            self.log.error(
                'Could not find title/authors/Antikvarium.hu id for %r' %
                self.url)
            self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' %
                           (antik_id, title, authors))
            return

        mi = Metadata(title, authors)
        mi.set_identifier('antik_hu', antik_id)
        self.antik_id = antik_id

        try:
            isbn = self.parse_isbn(root)
            self.log.info('Parsed ISBN: %s' % isbn)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            series = self.parse_series(root)
            self.log.info('Parsed series: %s' % series)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            series = None

        try:
            mi.series_index = self.parse_series_index(root)
            self.log.info('Parsed series index: %s' % mi.series_index)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            mi.series_index = None

        try:
            mi.comments = self.parse_comments(root)
            self.log.info('Parsed comments: %s' % mi.comments)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root)
            self.log.info('Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.antik_id,
                                                      self.cover_url)
            mi.has_cover = bool(self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        try:
            mi.publisher = self.parse_publisher(root)
            self.log.info('Parsed publisher: %s' % mi.publisher)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        try:
            mi.tags = self.parse_tags(root)
            self.log.info('Parsed tags: %s' % mi.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
            self.log.info('Parsed publication date: %s' % mi.pubdate)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        try:
            mi.languages = self.parse_languages(root)
            self.log.info('Parsed languages: %r' % mi.languages)
        except:
            self.log.exception('Error parsing languages for url: %r' %
                               self.url)

        mi.source_relevance = self.relevance

        if series:
            mi.series = series

        if self.antik_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)