def _metadata(self, baike):
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    if self.copy_image:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # If the serialization status (连载状态) says the book is finished (完结),
    # use the completion date as the publication date.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
def get_metadata(self, md):
    book = None
    if md.isbn:
        book = self.get_book_by_isbn(md.isbn)
    if not book:
        book = self.get_book_by_title(md.title)
    if not book:
        return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        for r in REMOVES:
            mi.author_sort = r.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    img_url = book['images']['large']
    img_fmt = img_url.split(".")[-1]
    img = StringIO(urlopen(img_url).read())
    mi.cover_data = (img_fmt, img)

    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
def _metadata(self, book):
    authors = []
    if book['author']:
        for author in book['author']:
            for r in REMOVES:
                author = r.sub("", author)
            authors.append(author)
    if not authors:
        authors = [u'佚名']

    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    mi = Metadata(book['title'])
    mi.authors = authors
    mi.author_sort = mi.authors[0]
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)
    mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
    mi.source = u'豆瓣'
    mi.cover_url = book['images']['large']

    if self.copy_image:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
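# A minimal sketch (not part of the original plugins) of the Metadata-population
# pattern the workers above share: construct Metadata(title, authors), fill in
# the optional fields, and return the object. The `info` dict layout and the
# `build_minimal_metadata` name are illustrative assumptions.
def build_minimal_metadata(info):
    import datetime
    from calibre.ebooks.metadata.book.base import Metadata

    mi = Metadata(info.get('title') or u'Unknown')
    mi.authors = info.get('authors') or [u'Unknown']
    mi.author_sort = mi.authors[0]
    mi.isbn = info.get('isbn')       # may be None; calibre tolerates that
    mi.tags = info.get('tags') or []
    mi.timestamp = datetime.datetime.now()
    return mi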
def parse(self, xml_detail):
    data = xml_detail.split('\n')[1].split("|")
    self.log(data)

    title = data[1]
    authors = [data[0]]
    comments = data[13]
    isbn = data[3]
    publisher = data[6]
    pub_date_tmp = data[34].split('-')
    pub_date = datetime.datetime(int(pub_date_tmp[0]), int(pub_date_tmp[1]),
                                 int(pub_date_tmp[2]), tzinfo=utc_tz)
    # isbn comes from a split, so it is an (possibly empty) string, never None
    if isbn:
        isbn_tmp = re.sub("-", "", isbn)
        cover = "%s/images/covers/%s.jpg" % (self.plugin.BASE_URL, isbn_tmp)
    else:
        cover = None

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.publisher = publisher
        mi.pubdate = pub_date
        mi.isbn = isbn
        mi.cover_url = cover
        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)
        return mi
    else:
        return None
def get_metadata(self, md, select):
    book = None
    if md.isbn:
        book = self.get_book_by_isbn(md.isbn)
    if not book:
        book = self.get_book_by_title(md.title, md.author_sort, select)
    if not book:
        return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        for r in REMOVES:
            mi.author_sort = r.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    img_url = book['images']['large']
    img_fmt = img_url.split(".")[-1]
    img = StringIO(urlopen(img_url).read())
    mi.cover_data = (img_fmt, img)

    #logging.error("=================\ndouban metadata:\n%s" % mi)
    return mi
def get_baike_metadata(self, title):
    from baidubaike import Page

    try:
        baike = Page(title)
    except:
        return None

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = info.get(u'首发网站', None)
    if not plat:
        plat = info.get(u'首发状态', "网络小说平台")
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.isbn = '0000000000001'
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.comments = baike.get_summary()

    # If the serialization status (连载状态) says the book is finished (完结),
    # use the completion date as the publication date.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
def parse(self, xml_detail, xml_more_info):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    serie, serie_index = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(as_unicode(title), authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)
        return mi
    else:
        self.log('Result skipped because title or authors were not found')
        return None
def parse(self, xml_detail):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: str(self.number)}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(str(self.number), cover)
        return mi
    else:
        return None
def parse(self, xml_detail):
    sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
    authors = []
    tags = []

    # Walk the MARC-like record table row by row; the first cell holds the
    # field tag, the second holds the data.
    xpath = self.XPath('//table[@id="record"]//tr')
    for row in xpath(xml_detail):
        ch = row.getchildren()
        txt = ch[0].text.strip()
        data = self.normalize(ch[1].text)
        if txt.startswith('245') and title is None:
            title = self.parse_title(data)
        if txt.startswith('246'):
            title = self.parse_title(data)
        elif txt.startswith('100') or txt.startswith('700'):
            res = self.parse_author(data)
            if res is not None:
                authors.append(res)
        elif txt == 'SYS':
            sys_ident = data.strip()
        elif txt == '020':
            isbn = self.parse_isbn(data)
        elif txt == '260':
            publisher, pub_year = self.parse_publisher(data)
        elif txt.startswith('490') and serie is None:
            serie, serie_index = self.parse_serie(data)
        elif txt == '655 7':
            tags.append(self.parse_tags(data))

    if isbn is not None and isbn != '':
        cover = self.parse_cover(isbn)

    if title is not None and len(authors) > 0 and sys_ident is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.identifiers = {self.plugin.name: sys_ident}
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(sys_ident, cover)
        return mi
    else:
        self.log('Data not found')
        return None
def parse_response(cls, response, isbn_initial, log):
    metadata_items = []
    page_soup = BeautifulSoup(response.text)
    for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):
        title = cls.find(candidate, 'b-result__name-wrap', True)
        author = [x.strip() for x in
                  cls.find(candidate, 'b-result__author', True).split(',')]
        comments = cls.find(candidate, 'b-result__desc__full', True).replace(u'Скрыть', '').strip()
        isbn = cls.find(candidate, 'b-result__isbn', True).split(':')[-1].split(',')[0].strip()

        log.info(u'Found candidate %s: %s' % (idx, title))

        publisher = None
        pubdate = None
        other_info = cls.find(candidate, 'b-result__years', True).strip()
        if other_info:
            for entry in other_info.split(';'):
                k, v = entry.split(':', 1)
                k = k.strip()
                if k == u'Год':
                    pubdate = parse_only_date('1.1.%s' % v.split(',')[0].strip())
                elif k == u'Издательство':
                    publisher = v.strip()

        metadata_item = Metadata(title, author)
        metadata_item.isbn = isbn or isbn_initial
        if comments:
            metadata_item.comments = comments
        if publisher is not None:
            metadata_item.publisher = publisher
        if pubdate is not None:
            metadata_item.pubdate = pubdate
        metadata_items.append(metadata_item)
    return metadata_items
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    # log.info('entry_ is: ', entry_)
    id_url = entry_['url']
    douban_id = entry_['id']
    title_ = entry_['title']
    subtitle = entry_['subtitle']
    authors = [x.strip() for x in entry_['author'] if x]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    mi.comments = entry_['summary']
    mi.publisher = entry_['publisher']

    # ISBN
    mi.isbn = entry_['isbn10']
    mi.all_isbns = [entry_['isbn10'], entry_['isbn13']]

    # Tags
    mi.tags = [x['name'].strip() for x in entry_['tags']]

    # pubdate
    pubdate = entry_['pubdate']
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    mi.rating = float(entry_['rating']['average']) / 2.0

    # Cover
    mi.has_douban_cover = entry_['image']
    return mi
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import UNDEFINED_DATE

    root = parse_html(raw)
    mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

    # Identifiers
    if self.basic_data['isbns']:
        mi.isbn = self.basic_data['isbns'][0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    if self.basic_data['tags']:
        mi.tags = self.basic_data['tags']
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    mi.publisher = self.basic_data['publisher']

    # Pubdate
    if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:
        mi.pubdate = self.basic_data['pubdate']

    # Rating
    if self.basic_data['rating']:
        mi.rating = self.basic_data['rating']

    # Comments
    comments = ''
    for cid in ('summary', 'contributorbio', 'quotes_reviews'):
        cid = 'desc_{}{}-content'.format(cid, self.sku)
        div = root.xpath('//*[@id="{}"]'.format(cid))
        if div:
            comments += self.render_comments(div[0])
    if comments:
        mi.comments = comments

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def _metadata(self, baike):
    from calibre.ebooks.metadata.book.base import Metadata

    info = baike.get_info()
    logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    mi.provider_key = KEY
    mi.provider_value = baike.get_id()

    if self.copy_image and mi.cover_url:
        logging.debug("fetching cover: %s", mi.cover_url)
        img = io.BytesIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # If the serialization status (连载状态) says the book is finished (完结),
    # use the completion date as the publication date.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
def _check_proceed_with_extracted_isbns(self, payload):
    extracted_ids, _same_isbn_ids, _failed_ids = payload

    modified = set()
    db = self.gui.current_db
    for i, title, last_modified, isbn in extracted_ids:
        lm = db.metadata_last_modified(i, index_is_id=True)
        if lm > last_modified:
            title = db.title(i, index_is_id=True)
            authors = db.authors(i, index_is_id=True)
            if authors:
                authors = [x.replace('|', ',') for x in authors.split(',')]
                title += ' - ' + authors_to_string(authors)
            modified.add(title)

    if modified:
        from calibre.utils.icu import lower
        modified = sorted(modified, key=lower)
        if not question_dialog(self.gui, _('Some books changed'), '<p>' +
                _('The metadata for some books in your library has'
                  ' changed since you started the download. If you'
                  ' proceed, some of those changes may be overwritten. '
                  'Click "Show details" to see the list of changed books. '
                  'Do you want to proceed?'),
                det_msg='\n'.join(modified)):
            return

    # At this point we want to re-use code in edit_metadata to go ahead and
    # apply the changes. So we will replace the Metadata objects with some
    # empty ones with only the isbn field set so only that field gets updated
    id_map = {}
    for i, title, last_modified, isbn in extracted_ids:
        mi = Metadata(_('Unknown'))
        mi.isbn = isbn
        id_map[i] = mi

    edit_metadata_action = self.gui.iactions['Edit Metadata']
    edit_metadata_action.apply_metadata_changes(
        id_map, callback=self._mark_and_display_results)
def _metadata(self, baike):
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "網絡小說平台"
    plat = info.get(u'首發狀態', plat)
    plat = info.get(u'首發網站', plat)
    plat = plat.replace(u'首發', '')
    mi.publisher = info.get(u'連載平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    if self.copy_image:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # If the serialization status (連載狀態) says the book is finished (完結),
    # use the completion date as the publication date.
    if u'完結' in info.get(u'連載狀態', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'連載狀態'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
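# The baike/douban snippets above mix `from cStringIO import StringIO`
# (Python 2) with `io.BytesIO` (Python 3). A small compatibility shim like
# this sketch would let the cover-fetching code run unchanged under either
# interpreter. The `fetch_cover_data` helper name is illustrative, not part
# of the original code.
try:
    from cStringIO import StringIO as BytesIO   # Python 2
    from urllib2 import urlopen
except ImportError:
    from io import BytesIO                      # Python 3
    from urllib.request import urlopen

def fetch_cover_data(cover_url):
    # Returns the (format, data) pair expected by Metadata.cover_data.
    img = BytesIO(urlopen(cover_url).read())
    img_fmt = cover_url.split(".")[-1]
    return (img_fmt, img)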
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If the URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
def to_metadata(self, log, entry_, timeout):  # {{{
    from calibre.utils.date import parse_date, utcnow

    log.info("to_metadata")
    douban_id = entry_.get("id")
    title = entry_.get("title")
    description = entry_.get("summary")
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get("publisher")
    isbn = entry_.get("isbn13")  # ISBN10 is obsolete, use ISBN13
    pubdate = entry_.get("pubdate")
    authors = entry_.get("author")
    book_tags = entry_.get("tags")
    rating = entry_.get("rating")
    cover_url = entry_.get("cover")
    series = entry_.get("series")

    if not authors:
        authors = [("Unknown")]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {"douban": douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN: the field may arrive as a single string or a list of strings
    isbns = []
    if isinstance(isbn, (type(""), bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for x in isbn:
            if check_isbn(x):
                isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    mi.tags = book_tags

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except BaseException:
            log.error("Failed to parse pubdate %r" % pubdate)

    # Ratings
    if rating:
        try:
            # mi.publisher += "#PrB.rating#" + str(rating)
            mi.rating = rating / 2.0
        except BaseException:
            log.exception("Failed to parse rating")
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If the URL contains "book-default", the book doesn't have a cover
        if u.find("book-default") == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series

    return mi
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If the URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
def parse(self, raw, desc_raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_date, utcnow
    import json

    root = parse_html(raw.decode('gb18030'))
    title = root.xpath('//*[@id="name"]/div[1]/text()')
    title = title[0].strip()
    authors = []
    for i in root.xpath('//*[@id="p-author"]/a'):
        authors.append(i.text.strip())
    mi = Metadata(title, authors)

    # Collect the "key: value" pairs from the parameter list. Using
    # encoding='unicode' keeps the keys as text so that lookups such as
    # info.get(u'出版社') work; split at most once so values may contain ':'.
    information = root.xpath('//*[@id="parameter2"]/li')
    info = dict()
    for i in information:
        tmp = etree.tostring(i, method='text', encoding='unicode').split(u':', 1)
        if len(tmp) == 2:
            info[tmp[0].strip()] = tmp[1].strip()

    # Identifiers
    mi.identifiers = self.plugin.identifiers
    mi.identifiers['jd'] = self.sku
    isbn = info.get('ISBN')
    self.log.info(isbn)
    if isbn:
        mi.isbn = isbn
        self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        mi.identifiers['isbn'] = isbn

    # Publisher
    mi.publisher = info.get(u'出版社')

    # Pubdate
    pubdate = info.get(u'出版时间')
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            self.log.error('Failed to parse pubdate %r' % pubdate)

    # Series
    mi.series = info.get(u'丛书名')

    # Cover
    img = root.xpath('//*[@id="spec-n1"]/img')
    cover = img[0].get('src')
    if cover:
        if not cover.startswith('http'):
            cover = 'https:' + cover
        self.plugin.cache_identifier_to_cover_url(self.sku, cover)
        self.log.info(cover)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    # Comments
    # The description arrives as a JSONP payload:
    # showdesc({"date":1583588455348,"content":" ... "})
    try:
        desc = json.loads(desc_raw[9:-1].decode('gb18030'))
        desc_root = parse_html(desc['content'])
        div = desc_root.xpath('//*[@id="detail-tag-id-3"]/div[2]/div/text()')
        comments = div[0]
        mi.comments = comments
    finally:
        # Return whatever was collected even if the comments block fails.
        return mi
def retrieve_bokelai_detail(self, bokelai_id, log, result_queue, timeout):
    detail_url = self.BOKELAI_DETAIL_URL % bokelai_id
    log.info(detail_url)

    try:
        br = self.browser
        _raw = br.open_novisit(detail_url, timeout=timeout)
        raw = _raw.read()
    except Exception as e:
        log.exception('Failed to load detail page: %s' % detail_url)
        return

    # The page embeds its bibliographic data as JSON-LD
    root = etree.HTML(raw)
    info_json_text = root.xpath("//script[@type='application/ld+json']")[0].text
    log.info(info_json_text)
    info_json = json.loads(info_json_text)

    title = info_json['name']
    authors = info_json['author'][0]['name'].split(",")
    publisher = info_json['publisher'][0]['name']
    isbn = info_json['workExample']['workExample']['isbn']
    pubdate = info_json['datePublished']

    comments_ele = root.xpath("(//div[@class='content'])[1]//text()")
    comments = "\n".join(comments_ele)

    tags = list()
    for ele in root.xpath("//li[contains(text(),'本書分類:')]/a"):
        log.info(ele.text)
        if "/" in ele.text:
            tags.extend(ele.text.split("/"))
        else:
            tags.append(ele.text)

    cover_url = re.search(r'https[^\?\=\&]*' + bokelai_id + r'[^\?\=\&]*',
                          info_json['image']).group(0)

    if not authors:
        authors = [_('Unknown')]

    log.info(title, authors, publisher, isbn, pubdate, comments, tags, cover_url)

    mi = Metadata(title, authors)
    mi.identifiers = {'bokelai': bokelai_id, 'isbn': isbn}
    mi.publisher = publisher
    mi.comments = comments
    mi.isbn = isbn
    mi.tags = tags

    if pubdate:
        try:
            from calibre.utils.date import parse_date, utcnow
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    if cover_url is not None:
        mi.has_bokelai_cover = cover_url
        self.cache_identifier_to_cover_url(mi.identifiers['bokelai'], mi.has_bokelai_cover)
    else:
        mi.has_bokelai_cover = None

    result_queue.put(mi)
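# Several workers above repeat the same pubdate idiom: parse with calibre's
# parse_date, assume UTC, and fall back to the 15th of the current month when
# the string only carries a year or month. A shared helper such as this sketch
# (the name `parse_pubdate_or_none` is made up) would capture it once.
def parse_pubdate_or_none(pubdate, log):
    from calibre.utils.date import parse_date, utcnow
    if not pubdate:
        return None
    try:
        default = utcnow().replace(day=15)
        return parse_date(pubdate, assume_utc=True, default=default)
    except Exception:
        log.error('Failed to parse pubdate %r' % pubdate)
        return None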
def parse_details(self, root):
    try:
        legie_id = self.parse_legie_id(self.url)
    except:
        self.log.exception('Error parsing Legie id for url: %r' % self.url)
        legie_id = None

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not legie_id:
        self.log.error('Could not find title/authors/Legie id for %r' % self.url)
        self.log.error('Legie: %r Title: %r Authors: %r' % (legie_id, title, authors))
        return

    self.legie_id = legie_id

    rating = comments = series = series_index = None
    try:
        rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        (series, series_index) = self.parse_series(root)
    except:
        self.log.info('Series not found.')
    try:
        tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
        tags = None

    if legie_id:
        editions = self.get_editions()
        if editions:
            num_editions = len(editions)
            self.log.info('Found %d editions' % num_editions)
            for edition in editions:
                (year, cover_url, publisher, isbn) = edition
                mi = Metadata(title, authors)
                self.legie_id = "%s#%s" % (legie_id, year)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                if cover_url:
                    mi.cover_url = self.cover_url = cover_url
                    self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                mi.publisher = publisher
                mi.isbn = isbn
                mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
        else:
            mi = Metadata(title, authors)
            mi.set_identifier('legie', self.legie_id)
            mi.source_relevance = self.relevance
            mi.rating = rating
            mi.comments = comments
            mi.series = series
            mi.series_index = series_index
            try:
                self.cover_url = self.parse_cover(root)
            except:
                self.log.exception('Error parsing cover for url: %r' % self.url)
            if tags:
                mi.tags = tags
            mi.has_cover = bool(self.cover_url)
            # Publisher, ISBN and publication year come from the per-edition
            # data, so they are not available on this code path.
            mi.language = "ces"
            self.result_queue.put(mi)

    if self.legie_id:
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select

    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            if q.year != UNDEFINED_DATE:
                mi.pubdate = q
        except:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/',
                                         'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
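# The edelweiss worker above shows the cover-cache round trip used throughout
# these plugins: map ISBN to identifier, map identifier to cover URL, and let
# has_cover reflect whether a URL was cached. A condensed sketch; the
# `cache_cover` name and argument layout are illustrative assumptions.
def cache_cover(plugin, isbn, ident, cover_url):
    if isbn:
        plugin.cache_isbn_to_identifier(isbn, ident)
    if cover_url:
        plugin.cache_identifier_to_cover_url(ident, cover_url)
    # Later, a Metadata object can report cover availability like this:
    return plugin.cached_identifier_to_cover_url(ident) is not None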
def get_details(self):
    '''
    The get_details() function for stripping the website for all information
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the html code returned 404
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the title of the book
    try:
        title_node = root.xpath('//span[@itemprop="name"]')
        self.title = title_node[0].text
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//span[@class="expandAuthorName"]')
        author_strings = author_node[0].text.split(",")
        #print(author_strings)
        for name in author_strings:
            self.authors.append(name.strip())
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Get the series of the book
    try:
        series_node = root.xpath('//b[contains(text(), "Serie")]/a')
        if len(series_node) > 0:
            self.series = series_node[0].text.split(": ")[0].strip()
            self.series_index = series_node[0].text.split(": ")[-1].strip()
            # print("'%s'" % self.series)
            # print("'%s'" % self.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Ratings are not exposed on the page, so default to 0.0
    try:
        self.rating = 0.0
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        isbn_node = root.xpath('//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]')
        if len(isbn_node) > 0:
            self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        comment_node = root.xpath('//meta[@name="description"]/@content')
        self.comments = comment_node[0]
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        cover_node = root.xpath('//div[@class="bookDetailCoverCover"]/img/@src')
        self.cover_url = "https://mofibo.com" + cover_node[0]
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        publisher_node = root.xpath('//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
        if len(publisher_node) > 0:
            self.publisher = publisher_node[0].text
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported, though.
    try:
        language_node = root.xpath('//b[@class="expanderLanguage"]')
        language = language_node[0].text.strip().replace("Sprog:", "").replace(" ", "")
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the publication date
    try:
        pubdate_node = root.xpath('//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]')
        if len(pubdate_node) > 0:
            date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
            format_str = '%Y-%m-%d'  # The format
            self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Get the tags
    try:
        tags = []
        tags_node = root.xpath('//span[@itemprop="category"]')
        tags.append(tags_node[0].text.strip())
        self.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Set up the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('mofibo', self.url)

    # Set series
    if self.series:
        try:
            meta_data.series = self.series
            meta_data.series_index = self.series_index
        except:
            self.log.exception('Error loading series')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set pubdate
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')
    # Set tags
    if self.tags:
        try:
            meta_data.tags = self.tags
        except:
            self.log.exception('Error loading tags')

    # Clean and queue the metadata
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
    self.load_config()

    if authors is None:
        authors = []

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # ignore unknown authors
    ignored_authors = ["V. A.", "V.A.", "Unknown", "Unbekannt"]
    for i in ignored_authors:
        authors = [x for x in authors if x != i]

    if (isbn is None) and (idn is None) and (title is None) and (not authors):
        log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
        return None

    queries = []
    # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results
    exact_search = {}
    if idn is not None:
        exact_search['idn'] = idn
        # in case we look for an IDN only, search for the IDN and skip all the other stuff
        queries.append('num=' + idn)
    else:
        authors_v = []
        title_v = []

        # create some variants of the given authors
        if authors != []:
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=False)))  # concat all author names ("Peter Meier Luise Stark")
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=True)))  # use only the first author
            for a in authors:
                authors_v.append(a)  # use all authors, one by one

            # remove duplicates
            unique_authors_v = []
            for i in authors_v:
                if i not in unique_authors_v:
                    unique_authors_v.append(i)

        # create some variants of the given title
        if title is not None:
            title_v.append(title)  # simply use the given title
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False)))  # remove some punctuation characters
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))  # remove the subtitle (everything after " : ")
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=False)))  # remove some punctuation characters and joiners ("and", "&", ...)
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)))  # remove the subtitle (everything after " : ") and joiners ("and", "&", ...)
            # TODO: remove subtitle after " - "

            # remove duplicates
            unique_title_v = []
            for i in title_v:
                if i not in unique_title_v:
                    unique_title_v.append(i)

        # title and author
        if authors_v != [] and title_v != []:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')

            # try with the first author as title and the title (without subtitle) as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

            # try with author and title (without subtitle) in any index
            if isbn is not None:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # author but no title
        elif authors_v != [] and title_v == []:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')

            # try with the author as title
            if isbn is not None:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # title but no author
        elif authors_v == [] and title_v != []:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')

            # try with the title as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '"')

        # as a last resort only use the isbn
        if isbn is not None:
            queries.append('num=' + isbn)

    # remove duplicate queries
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None

    for query in uniqueQueries:
        # SRU does not work with "+" or "?" characters in the query, so we simply remove them
        query = re.sub('[\+\?]', '', query)

        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
        log.info(query)

        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)

        if results is None:
            continue

        log.info("Parsing records")

        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}

        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            authors = []
            author_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []
            publisher_name = None
            publisher_location = None

            ##### Field 264 #####
            # Publisher Name and Location
            fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()

            # Publishing Date
            for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns):
                match = re.search("(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 1, 12, 30, 0)
                    break

            # Log
            if publisher_name is not None:
                log.info("Extracted Publisher: %s" % publisher_name)
            if publisher_location is not None:
                log.info("Extracted Publisher Location: %s" % publisher_location)
            if pubdate is not None:
                log.info("Extracted Publication Year: %s" % pubdate)

            ##### Field 245 #####
            # Title/Series/Series_Index
            title_parts = []
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                # if a,n,p,n,p,n,p exist: series = a + n0 + " - " + p0 + n1 + " - " + p1, series_index = n2, title = p2
                # if a,n,p,n,p exist:     series = a + n0 + " - " + p0, series_index = n1, title = p1 (example: dnb-id 1008774839)
                # if a,n,p exist:         series = a, series_index = n, title = p
                # if a exists:            title = a
                # TODO: a,n,p,n (i.e. 956375146)

                code_p = []
                code_n = []
                code_a = []

                for j in i.xpath(".//marc21:subfield[@code='p']", namespaces=ns):
                    code_p.append(j.text.strip())

                for j in i.xpath(".//marc21:subfield[@code='n']", namespaces=ns):
                    match = re.search("(\d+[,\.\d+]?)", j.text.strip())
                    if match:
                        code_n.append(match.group(1))
                    else:
                        # looks like sometimes DNB does not know the series index and uses something like "[...]"
                        code_n.append("0")

                for j in i.xpath(".//marc21:subfield[@code='a']", namespaces=ns):
                    code_a.append(j.text.strip())

                if len(code_p) == 0:
                    title_parts = title_parts + code_a
                elif len(code_p) > 0 and len(code_p) == len(code_n):
                    series = " : ".join(code_a)  # I've never seen more than one code_a, but who knows...
                    for i in range(0, len(code_p) - 1):
                        series = series + " " + code_n[i] + " " + code_p[i]
                    series_index = code_n[-1]
                    title_parts.append(code_p[-1])

            # subtitle 1: Field 245
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns):
                title_parts.append(i.text.strip())
                break

            # subtitle 2
            #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
            #    title = title + " / " + i.text.strip()
            #    break

            title = " : ".join(title_parts)

            # Log
            if series_index is not None:
                log.info("Extracted Series_Index from Field 245: %s" % series_index)
            if series is not None:
                log.info("Extracted Series from Field 245: %s" % series)
                series = self.cleanUpSeries(log, series, publisher_name)
            if title is not None:
                log.info("Extracted Title: %s" % title)
                title = self.cleanUpTitle(log, title)

            # Title_Sort
            if len(title_parts) > 0:
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match('^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(filter(None, [title_sort_regex.group(1).strip(), title_sort_regex.group(3).strip(), ", " + sortword]))
                title_sort = " : ".join(title_sort_parts)

            # Log
            if title_sort is not None:
                log.info("Extracted Title_Sort: %s" % title_sort)

            ##### Field 100 and Field 700 #####
            # Authors
            for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # primary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # secondary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            if len(authors) == 0:  # if no "real" author was found, take all persons involved
                for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]

            # Log
            if len(authors) > 0:
                log.info("Extracted Authors: %s" % " & ".join(authors))
            if author_sort is not None:
                log.info("Extracted Author_Sort: %s" % author_sort)

            ##### Field 856 #####
            # Comments
            for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        # Strip the boilerplate heading "Angaben aus der Verlagsmeldung"
                        comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                        comments = sanitize_comments_html(comments)
                        break
                    except:
                        log.info("Could not download Comments from %s" % i)

            # Log
            if comments is not None:
                log.info('Comments: %s' % comments)

            # If no comments are found for this edition, look at other editions of this book (Field 776)
            # TODO: Make this configurable (default: yes)
            if comments is None:
                # get all other issues
                for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]", namespaces=ns):
                    other_idn = re.sub("^\(.*\)", "", i.text.strip())
                    subquery = 'num=' + other_idn + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'

                    log.info(subquery)

                    if self.cfg_dnb_token is None:
                        subresults = self.getSearchResultsByScraping(log, subquery, timeout)
                    else:
                        subresults = self.getSearchResults(log, subquery, timeout)

                    if subresults is None:
                        continue

                    for subrecord in subresults:
                        for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                            if i.text.startswith("http://deposit.dnb.de/"):
                                br = self.browser
                                log.info('Downloading Comments from: %s' % i.text)
                                try:
                                    comments = br.open_novisit(i.text, timeout=30).read()
                                    comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                                    comments = sanitize_comments_html(comments)
                                    break
                                except:
                                    log.info("Could not download Comments from %s" % i)

                        if comments is not None:
                            log.info('Comments from other issue: %s' % comments)
                            break

            ##### Field 016 #####
            # ID: IDN
            for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                idn = i.text.strip()
                break
            # Log
            if idn is not None:
                log.info("Extracted ID IDN: %s" % idn)

            ##### Field 024 #####
            # ID: URN
            for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                urn = i.text.strip()
                break
            # Log
            if urn is not None:
                log.info("Extracted ID URN: %s" % urn)

            ##### Field 020 #####
            # ID: ISBN
            for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                if match is not None:
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                break
            # Log
            if isbn is not None:
                log.info("Extracted ID ISBN: %s" % isbn)

            # When doing an exact search for a given ISBN, skip books with wrong ISBNs
            if isbn is not None and "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info("Extracted ISBN does not match book's ISBN, skipping record")
                    continue

            ##### Field 082 #####
            # ID: Sachgruppe (DDC)
            for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                ddc.append(i.text.strip())
            # Log
            if len(ddc) > 0:
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

            ##### Field 490 #####
            # In theory this field is not used for "real" book series; field 830 should be used instead. But it is used anyway.
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # "v" is either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available, use this instead of attribute "a"
                    attr_v = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    parts = re.split(" : ", attr_v)
                    if len(parts) == 2:
                        if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search("\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]
                            match = re.search("(\d+[,\.\d+]?)", indexpart)
                            if match is not None:
                                series_index = match.group(1)
                                series = textpart.strip()
                    else:
                        match = re.search("(\d+[,\.\d+]?)", attr_v)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"

                    if series_index is not None:
                        series_index = series_index.replace(',', '.')

                    # Use the series name from attribute "a" if it was not already found in attribute "v"
                    if series is None:
                        series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()

                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 490: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 490: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 246 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
                    if match is not None:
                        series = match.group(1)
                        series_index = match.group(2)
                        # Log
                        if series_index is not None:
                            log.info("Extracted Series Index from Field 246: %s" % series_index)
                        if series is not None:
                            log.info("Extracted Series from Field 246: %s" % series)
                            series = self.cleanUpSeries(log, series, publisher_name)
                        if series is not None:
                            break

            ##### Field 800 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='t']", namespaces=ns)[0].text.strip()
                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 800: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 800: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 830 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 830: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 830: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 689 #####
            # GND Subjects
            for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                subjects_gnd.append(i.text.strip())
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            # Log
            if len(subjects_gnd) > 0:
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            ##### Fields 600-655 #####
            # Non-GND subjects
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects (iterate over a copy, since we mutate the list):
            for i in list(subjects_non_gnd):
                if len(i) < 2:
                    subjects_non_gnd.remove(i)
            # Log
            if len(subjects_non_gnd) > 0:
                log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

            ##### Field 250 #####
            # Edition
            for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                edition = i.text.strip()
                break
            # Log
            if edition is not None:
                log.info("Extracted Edition: %s" % edition)

            ##### Field 041 #####
            # Languages
            for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                languages.append(i.text.strip())
            # Log
            if languages:
                log.info("Extracted Languages: %s" % ",".join(languages))

            ##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
            #if self.cfg_guess_series is True:
            if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                guessed_title = None

                log.info("Starting Series Guesser")

                parts = re.split("[:]", self.removeSortingCharacters(title))

                if len(parts) == 2:
                    log.info("Title has two parts")
                    # make sure only one of the two parts contains digits
                    if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                        log.info("only one title part contains digits")
                        # figure out which part contains the index
                        if bool(re.search("\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]

                        # Look at the part without digits:
                        match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)  # remove odd characters from the start and end of the text part
                        if match:
                            textpart = match.group(1)

                        # Look at the part with digits:
                        # for title parts like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                guessed_title = textpart + " : Band " + guessed_series_index
                            else:
                                guessed_title = textpart
                            #log.info("ALGO1: guessed_title: " + guessed_title)
                            #log.info("ALGO1: guessed_series: " + guessed_series)
                            #log.info("ALGO1: guessed_series_index: " + guessed_series_index)
                        else:
                            # for title parts like: "Episode 2 Name of the series"
                            match = re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$", indexpart)
                            if match:
                                guessed_series_index = match.group(1)
                                guessed_series = match.group(2)
                                if guessed_series is None:
                                    # sometimes books with multiple volumes are detected as series without a name -> add the volume to the title
                                    guessed_series = textpart
                                    guessed_title = textpart + " : Band " + guessed_series_index
                                else:
                                    guessed_title = textpart
                                #log.info("ALGO2: guessed_title: " + guessed_title)
                                #log.info("ALGO2: guessed_series: " + guessed_series)
                                #log.info("ALGO2: guessed_series_index: " + guessed_series_index)
                            else:
                                # for title parts like: "Band 2"
                                match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$", indexpart)
                                if match:
                                    guessed_series_index = match.group(1)
                                    # ...with a textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE
                                    # some false positives
                                    match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$", textpart)
                                    if match:
                                        guessed_series = match.group(1)
                                        guessed_title = match.group(2)
                                        log.info("ALGO3: guessed_title: " + guessed_title)
                                        log.info("ALGO3: guessed_series: " + guessed_series)
                                        log.info("ALGO3: guessed_series_index: " + guessed_series_index)

                elif len(parts) == 1:
                    log.info("Title has one part")
                    # for titles like: "Name of the series - Title (Episode 2)"
                    match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                    if match:
                        guessed_series_index = match.group(3)
                        guessed_series = match.group(1)
                        guessed_title = match.group(2)
                        #log.info("ALGO4: guessed_title: " + guessed_title)
                        #log.info("ALGO4: guessed_series: " + guessed_series)
                        #log.info("ALGO4: guessed_series_index: " + guessed_series_index)
                    else:
                        # for titles like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            guessed_title = guessed_series + " : Band " + guessed_series_index
                            #log.info("ALGO5: guessed_title: " + guessed_title)
                            #log.info("ALGO5: guessed_series: " + guessed_series)
                            #log.info("ALGO5: guessed_series_index: " + guessed_series_index)

                # Log
                if guessed_series is not None:
                    log.info("Guessed Series: %s" % guessed_series)
                    #guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name)
                if guessed_series_index is not None:
                    log.info("Guessed Series Index: %s" % guessed_series_index)
                if guessed_title is not None:
                    log.info("Guessed Title: %s" % guessed_title)
                    guessed_title = self.cleanUpTitle(log, guessed_title)

                if guessed_series is not None and guessed_series_index is not None and guessed_title is not None:
                    title = guessed_title
                    series = guessed_series
                    series_index = guessed_series_index

            ##### Filter exact searches #####
            # When doing an exact search for a given IDN, skip books with wrong IDNs
            # TODO: Currently exact_search for ISBN is not implemented.
Would require ISBN-10 and ISBN-13 conversions if idn is not None and "idn" in exact_search: if idn != exact_search["idn"]: log.info("Extracted IDN does not match book's IDN, skipping record") continue ##### Put it all together ##### if self.cfg_append_edition_to_title == True and edition is not None: title = title + " : " + edition mi = Metadata(self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors)) mi.title_sort = self.removeSortingCharacters(title_sort) mi.author_sort = self.removeSortingCharacters(author_sort) mi.languages = languages mi.pubdate = pubdate mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)])) mi.series = self.removeSortingCharacters(series) mi.series_index = series_index mi.comments = comments mi.isbn = isbn # also required for cover download mi.set_identifier('urn',urn) mi.set_identifier('dnb-idn',idn) mi.set_identifier('ddc', ",".join(ddc)) # cfg_subjects: # 0: use only subjects_gnd if self.cfg_fetch_subjects == 0: mi.tags = self.uniq(subjects_gnd) # 1: use only subjects_gnd if found, else subjects_non_gnd elif self.cfg_fetch_subjects == 1: if len(subjects_gnd)>0: mi.tags = self.uniq(subjects_gnd) else: mi.tags = self.uniq(subjects_non_gnd) # 2: subjects_gnd and subjects_non_gnd elif self.cfg_fetch_subjects == 2: mi.tags = self.uniq(subjects_gnd + subjects_non_gnd) # 3: use only subjects_non_gnd if found, else subjects_gnd elif self.cfg_fetch_subjects == 3: if len(subjects_non_gnd)>0: mi.tags = self.uniq(subjects_non_gnd) else: mi.tags = self.uniq(subjects_gnd) # 4: use only subjects_non_gnd elif self.cfg_fetch_subjects == 4: mi.tags = self.uniq(subjects_non_gnd) # 5: use no subjects at all elif self.cfg_fetch_subjects == 5: mi.tags = [] # put current result's metdata into result queue log.info("Final formatted result: \n%s" % mi) result_queue.put(mi)
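# The TODO above notes that exact ISBN filtering would need ISBN-10/ISBN-13
# conversion. A minimal, self-contained sketch of the standard conversion
# (the helper name isbn10_to_isbn13 is illustrative, not part of this plugin):
def isbn10_to_isbn13(isbn10):
    # Take the 9 significant digits, prefix '978', recompute the check digit
    core = '978' + isbn10.replace('-', '')[:9]
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    return core + str((10 - total % 10) % 10)
# isbn10_to_isbn13('0306406152') -> '9780306406157'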
def parse(self, raw): from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import parse_only_date, UNDEFINED_DATE from css_selectors import Select root = parse_html(raw) selector = Select(root) sku = next(selector('div.sku.attGroup')) info = sku.getparent() top = info.getparent().getparent() banner = top.find('div') spans = banner.findall('span') title = '' for i, span in enumerate(spans): if i == 0 or '12pt' in span.get('style', ''): title += astext(span) else: break authors = [ re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',') ] mi = Metadata(title.strip(), authors) # Identifiers isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')] for isbn in isbns: if isbn: self.plugin.cache_isbn_to_identifier(isbn, self.sku) isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True) if isbns and isbns[0]: mi.isbn = isbns[0] mi.set_identifier('edelweiss', self.sku) # Tags bisac = tuple(selector('div.bisac.attGroup')) if bisac: bisac = astext(bisac[0]) mi.tags = [x.strip() for x in bisac.split(',')] mi.tags = [ t[1:].strip() if t.startswith('&') else t for t in mi.tags ] # Publisher pub = tuple(selector('div.supplier.attGroup')) if pub: pub = astext(pub[0]) mi.publisher = pub # Pubdate pub = tuple(selector('div.shipDate.attGroupItem')) if pub: pub = astext(pub[0]) parts = pub.partition(':')[0::2] pub = parts[1] or parts[0] try: if ', Ship Date:' in pub: pub = pub.partition(', Ship Date:')[0] q = parse_only_date(pub, assume_utc=True) if q.year != UNDEFINED_DATE: mi.pubdate = q except: self.log.exception('Error parsing published date: %r' % pub) # Comments comm = '' general = tuple(selector('div#pd-general-overview-content')) if general: q = self.render_comments(general[0]) if q != '<p>No title summary available. </p>': comm += q general = tuple(selector('div#pd-general-contributor-content')) if general: comm += self.render_comments(general[0]) general = tuple(selector('div#pd-general-quotes-content')) if general: comm += self.render_comments(general[0]) if comm: mi.comments = comm # Cover img = tuple(selector('img.title-image[src]')) if img: href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/') self.plugin.cache_identifier_to_cover_url(self.sku, href) mi.has_cover = self.plugin.cached_identifier_to_cover_url( self.sku) is not None return mi
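# parse() above caches every valid ISBN found in the sku field but prefers the
# longest candidate (an ISBN-13 over an ISBN-10) for mi.isbn. A standalone
# sketch of that selection rule, with calibre's check_isbn stubbed to a plain
# length/digit test for illustration:
def pick_preferred_isbn(candidates):
    def check(x):
        x = x.strip().replace('-', '')
        return x if len(x) in (10, 13) and x.replace('X', '').isdigit() else None
    isbns = sorted((check(x) for x in candidates),
                   key=lambda x: len(x) if x else 0, reverse=True)
    return isbns[0] if isbns and isbns[0] else None
# pick_preferred_isbn(['0306406152', '9780306406157']) -> '9780306406157'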
def parse_details(self, root):
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for query: %r' % self.query)
        title = None
    if not title:
        self.log.error('Could not find title for %r' % self.query)
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for query: %r' % self.query)
        authors = []
    if not authors:
        self.log.error('Could not find authors for %r' % self.query)
        return
    mi = Metadata(title, authors)
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            # match 10 or 13 digits at the start, followed by a space or nothing
            p = re.compile('^([0-9]{13}|[0-9]{10})(?= |\Z)')
            if isinstance(isbn, str):
                m = p.match(isbn)
                if m:
                    mi.isbn = m.group()
            else:
                m = p.match(isbn[0])
                if m:
                    mi.isbn = m.group()
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        lang = self.parse_language(root)
        if lang:
            mi.languages = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)
    try:
        lccn = self.parse_lccn(root)
        if lccn:
            if isinstance(lccn, str):
                mi.set_identifier('lccn', lccn)
            else:
                for identifier in lccn:
                    mi.set_identifier('lccn', identifier)
    except:
        self.log.exception('Error parsing LCCN for url: %r' % self.url)
    try:
        ddc = self.parse_ddc(root)
        if ddc:
            if isinstance(ddc, str):
                mi.set_identifier('ddc', ddc)
            else:
                for identifier in ddc:
                    mi.set_identifier('ddc', identifier)
    except:
        self.log.exception('Error parsing DDC for url: %r' % self.url)
    try:
        lcc = self.parse_lcc(root)
        if lcc:
            if isinstance(lcc, str):
                mi.set_identifier('lcc', lcc)
            else:
                for identifier in lcc:
                    mi.set_identifier('lcc', identifier)
    except:
        self.log.exception('Error parsing LCC for url: %r' % self.url)
    mi.source_relevance = self.relevance
    self.result_queue.put(mi)
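# The ISBN regex above is easy to misread: it accepts a run of exactly 13 or
# exactly 10 digits at the start of the string, and only when followed by a
# space or the end of the string. A quick standalone check of that behaviour:
import re
_p = re.compile(r'^([0-9]{13}|[0-9]{10})(?= |\Z)')
assert _p.match('9780306406157')           # bare ISBN-13 matches
assert _p.match('0306406152 (pbk.)')       # ISBN-10 followed by a space matches
assert _p.match('030640615') is None       # 9 digits: too short
assert _p.match('97803064061570') is None  # 14 digits: lookahead fails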
def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers): from lxml import etree def tostring(x): if x is None: return '' return etree.tostring(x, method='text', encoding=unicode).strip() orig_isbn = identifiers.get('isbn', None) title_tokens = list(self.get_title_tokens(orig_title)) author_tokens = list(self.get_author_tokens(orig_authors)) results = [] def ismatch(title, authors): authors = lower(' '.join(authors)) title = lower(title) match = not title_tokens for t in title_tokens: if lower(t) in title: match = True break amatch = not author_tokens for a in author_tokens: if lower(a) in authors: amatch = True break if not author_tokens: amatch = True return match and amatch bl = feed.find('BookList') if bl is None: err = tostring(feed.find('errormessage')) raise ValueError('ISBNDb query failed:' + err) total_results = int(bl.get('total_results')) shown_results = int(bl.get('shown_results')) for bd in bl.xpath('.//BookData'): isbn = check_isbn(bd.get('isbn', None)) isbn13 = check_isbn(bd.get('isbn13', None)) if not isbn and not isbn13: continue if orig_isbn and orig_isbn not in {isbn, isbn13}: continue title = tostring(bd.find('Title')) if not title: continue authors = [] for au in bd.xpath('.//Authors/Person'): au = tostring(au) if au: if ',' in au: ln, _, fn = au.partition(',') au = fn.strip() + ' ' + ln.strip() authors.append(au) if not authors: continue comments = tostring(bd.find('Summary')) id_ = (title, tuple(authors)) if id_ in seen: continue seen.add(id_) if not ismatch(title, authors): continue publisher = tostring(bd.find('PublisherText')) if not publisher: publisher = None if publisher and 'audio' in publisher.lower(): continue mi = Metadata(title, authors) mi.isbn = isbn mi.publisher = publisher mi.comments = comments results.append(mi) return total_results, shown_results, results
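# ismatch() above is deliberately loose: a record passes if ANY title token
# occurs in the found title and ANY author token in the found authors (an
# empty token list always passes). The same rule, as a compact standalone
# sketch:
def loose_match(title_tokens, author_tokens, title, authors):
    title = title.lower()
    authors = ' '.join(authors).lower()
    tmatch = not title_tokens or any(t.lower() in title for t in title_tokens)
    amatch = not author_tokens or any(a.lower() in authors for a in author_tokens)
    return tmatch and amatch
# loose_match(['dune'], ['herbert'], 'Dune Messiah', ['Frank Herbert']) -> True
# loose_match(['dune'], ['asimov'], 'Dune Messiah', ['Frank Herbert']) -> False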
def parse(self, raw): from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import parse_only_date, UNDEFINED_DATE root = parse_html(raw) sku = CSSSelect("div.sku.attGroup")(root)[0] info = sku.getparent() top = info.getparent().getparent() banner = top.find("div") spans = banner.findall("span") title = "" for i, span in enumerate(spans): if i == 0 or "12pt" in span.get("style", ""): title += astext(span) else: break authors = [re.sub(r"\(.*\)", "", x).strip() for x in astext(spans[-1]).split(",")] mi = Metadata(title.strip(), authors) # Identifiers isbns = [check_isbn(x.strip()) for x in astext(sku).split(",")] for isbn in isbns: if isbn: self.plugin.cache_isbn_to_identifier(isbn, self.sku) isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True) if isbns and isbns[0]: mi.isbn = isbns[0] mi.set_identifier("edelweiss", self.sku) # Tags bisac = CSSSelect("div.bisac.attGroup")(root) if bisac: bisac = astext(bisac[0]) mi.tags = [x.strip() for x in bisac.split(",")] mi.tags = [t[1:].strip() if t.startswith("&") else t for t in mi.tags] # Publisher pub = CSSSelect("div.supplier.attGroup")(root) if pub: pub = astext(pub[0]) mi.publisher = pub # Pubdate pub = CSSSelect("div.shipDate.attGroupItem")(root) if pub: pub = astext(pub[0]) parts = pub.partition(":")[0::2] pub = parts[1] or parts[0] try: if ", Ship Date:" in pub: pub = pub.partition(", Ship Date:")[0] q = parse_only_date(pub, assume_utc=True) if q.year != UNDEFINED_DATE: mi.pubdate = q except: self.log.exception("Error parsing published date: %r" % pub) # Comments comm = "" general = CSSSelect("div#pd-general-overview-content")(root) if general: q = self.render_comments(general[0]) if q != "<p>No title summary available. </p>": comm += q general = CSSSelect("div#pd-general-contributor-content")(root) if general: comm += self.render_comments(general[0]) general = CSSSelect("div#pd-general-quotes-content")(root) if general: comm += self.render_comments(general[0]) if comm: mi.comments = comm # Cover img = CSSSelect("img.title-image[src]")(root) if img: href = img[0].get("src").replace("jacket_covers/medium/", "jacket_covers/flyout/") self.plugin.cache_identifier_to_cover_url(self.sku, href) mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None return mi
def _GoodreadsBook_to_Metadata(self, book): # type: (_GoodreadsBook) -> Metadata """ :param book: _GoodreadsBook: book :return: Metadata: Metadata """ mi = Metadata(book.title, book.authors) mi.source_relevance = 0 mi.set_identifier('goodreads', book.id) if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get( 'isbn'): mi.set_identifier('isbn', '') if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']: mi.set_identifier('amazon', book.asin) if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']: try: if len(book.isbn) == 10: mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn)) else: mi.isbn = check_isbn13(book.isbn) except: self.log.error("ISBN CONVERSION ERROR:", book.isbn) self.log.exception() if book.image_url: self.log.info('cache_identifier_to_cover_url:', book.asin, ':', book.image_url) self.cache_identifier_to_cover_url(book.id, book.image_url) if book.publisher: self.log.info('book.publisher is:', book.publisher) mi.publisher = book.publisher if book.pubdate: self.log.info('book.pubdate is:', book.pubdate.strftime('%Y-%m-%d')) mi.pubdate = book.pubdate if book.comments: self.log.info('book.editorial_review is:', book.comments) mi.comments = book.comments tags = self.prefs['ADD_THESE_TAGS'].split(',') tags.extend(book.tags) # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings'] # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags)))) if book.series: mi.series = book.series self.log.info(u'series:', book.series) if book.series_index: mi.series_index = book.series_index self.log.info(u'series_index:', "{0:.2f}".format(book.series_index)) else: mi.series_index = 0 if book.average_rating: mi.rating = book.average_rating self.clean_downloaded_metadata(mi) return mi
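# check_isbn13() above validates the converted identifier. The underlying rule
# is the standard weighted mod-10 check digit; a minimal standalone equivalent
# (calibre's helper returns the normalised ISBN rather than a boolean):
def is_valid_isbn13(isbn):
    digits = [int(c) for c in isbn if c.isdigit()]
    if len(digits) != 13:
        return False
    return sum((1 if i % 2 == 0 else 3) * d for i, d in enumerate(digits)) % 10 == 0
# is_valid_isbn13('978-0-306-40615-7') -> True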
def parse_details(self, root): try: isbn = self.extract_isbn(self.url) except: self.log.exception('No ISBN in URL: %r'%self.url) isbn = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not isbn: self.log.error('Could not find title/authors/Aladin id for %r'%self.url) self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index #mi.set_identifier('isbn', isbn) mi.isbn = isbn self.isbn = isbn # ISBN-13 try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! if mi.has_cover: self.log.info('Cover URL: '+mi.cover_url) try: mi.publisher = self.parse_publisher(root) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) except: self.log.exception('Error parsing published date for url: %r'%self.url) mi.language = 'ko' mi.source_relevance = self.relevance self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
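# parse_details() above, like its siblings in this file, repeats the same
# try/log/fall-back pattern for every field. A small illustrative helper that
# would collapse those blocks (a sketch, not part of the plugin):
def safe_parse(log, what, url, fn, default=None):
    # Run one field parser; on any failure, log it and return the default.
    try:
        return fn()
    except Exception:
        log.exception('Error parsing %s for url: %r' % (what, url))
        return default
# usage sketch:
#   mi.publisher = safe_parse(self.log, 'publisher', self.url,
#                             lambda: self.parse_publisher(root))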
def get_details(self):
    '''
    Scrape the details page and extract all the information we can.
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(
            self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the website returned a 404 page
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the json data within the HTML code (some stuff is easier to get with json)
    try:
        json_raw = root.xpath('(//script[@type="application/ld+json"])[2]')
        json_root = json.loads(json_raw[0].text.strip())
        #print(json.dumps(json_root, indent=4, sort_keys=True))
    except:
        self.log.error("Error loading JSON data")
        return

    # Get the title of the book
    try:
        self.title = json_root['name']
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//h2[@class="product-page-heading__autor"]//a')
        for name in author_node:
            self.authors.append(name.text.strip())
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Some books have ratings, let's use them.
    try:
        self.rating = float(json_root['aggregateRating']['ratingValue'])
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        self.isbn = json_root['isbn']
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        self.comments = parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        self.cover_url = json_root['image']
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        self.publisher = json_root['publisher']['name']
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported.
    try:
        language = json_root['inLanguage']['name']
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the publication date
    try:
        #pubdate_node = root.xpath('(//dl[@class="product-info-list"]//dd)[2]')  # Format dd-mm-yyyy
        pubdate_node = root.xpath(
            '//div[@class="product-page-block__container"]//dd'
        )  # Format dd-mm-yyyy
        date_str = pubdate_node[0].text.strip()
        format_str = '%d-%m-%Y'  # The format
        self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Setup the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('saxo', self.url)

    # Set rating
    if self.rating:
        try:
            meta_data.rating = self.rating
        except:
            self.log.exception('Error loading rating')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set publication date
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')

    # Put meta data
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
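# get_details() above reads most fields from the JSON-LD block embedded in the
# page instead of scraping individual nodes. A self-contained sketch of the
# technique (the HTML snippet is made up; Saxo's real pages embed a similar
# application/ld+json script):
import json
from lxml import html
_doc = html.fromstring(
    '<html><head><script type="application/ld+json">'
    '{"name": "Example Title", "isbn": "9780306406157",'
    ' "publisher": {"name": "Example Forlag"}}'
    '</script></head><body></body></html>')
_node = _doc.xpath('//script[@type="application/ld+json"]')[0]
_data = json.loads(_node.text)
# _data['name'] -> 'Example Title'; _data['publisher']['name'] -> 'Example Forlag'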
def to_metadata(browser, log, entry_, timeout): # {{{ from lxml import etree XPath = partial(etree.XPath, namespaces=NAMESPACES) # total_results = XPath('//openSearch:totalResults') # start_index = XPath('//openSearch:startIndex') # items_per_page = XPath('//openSearch:itemsPerPage') entry = XPath('//atom:entry') entry_id = XPath('descendant::atom:id') creator = XPath('descendant::dc:creator') identifier = XPath('descendant::dc:identifier') title = XPath('descendant::dc:title') date = XPath('descendant::dc:date') publisher = XPath('descendant::dc:publisher') subject = XPath('descendant::dc:subject') description = XPath('descendant::dc:description') language = XPath('descendant::dc:language') rating = XPath('descendant::gd:rating[@average]') def get_text(extra, x): try: ans = x(extra) if ans: ans = ans[0].text if ans and ans.strip(): return ans.strip() except: log.exception('Programming error:') return None id_url = entry_id(entry_)[0].text google_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: authors = [_('Unknown')] if not id_url or not title_: # Silently discard this entry return None mi = Metadata(title_, authors) mi.identifiers = {'google':google_id} try: raw = get_details(browser, id_url, timeout) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]) extra = entry(feed)[0] except: log.exception('Failed to get additional details for', mi.title) return mi mi.comments = get_text(extra, description) lang = canonicalize_lang(get_text(extra, language)) if lang: mi.language = lang mi.publisher = get_text(extra, publisher) # ISBN isbns = [] for x in identifier(extra): t = str(x.text).strip() if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): if t[:5].upper() == 'ISBN:': t = check_isbn(t[5:]) if t: isbns.append(t) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns # Tags try: btags = [x.text for x in subject(extra) if x.text] tags = [] for t in btags: atags = [y.strip() for y in t.split('/')] for tag in atags: if tag not in tags: tags.append(tag) except: log.exception('Failed to parse tags:') tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] # pubdate pubdate = get_text(extra, date) if pubdate: from calibre.utils.date import parse_date, utcnow try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: log.error('Failed to parse pubdate %r'%pubdate) # Ratings for x in rating(extra): try: mi.rating = float(x.get('average')) if mi.rating > 5: mi.rating /= 2 except: log.exception('Failed to parse rating') # Cover mi.has_google_cover = None for x in extra.xpath( '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'): mi.has_google_cover = x.get('href') break return mi
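# to_metadata() above parses pubdate with default=utcnow().replace(day=15):
# the feed can give partial dates like "YYYY" or "YYYY-MM", and a mid-month
# default day keeps the result from drifting into a neighbouring month or year
# when timezones are applied. A small illustration with calibre's parse_date:
from calibre.utils.date import parse_date, utcnow
_default = utcnow().replace(day=15)
_d = parse_date('2009-07', assume_utc=True, default=_default)
# _d.year == 2009, _d.month == 7; the missing day (15) comes from the default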
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): self.load_config() # get identifying tags from book idn = identifiers.get('dnb-idn', None) isbn = check_isbn(identifiers.get('isbn', None)) # ignore unknown author placeholders if authors and authors[0] in ("V. A.", "V.A.", "Unknown", "Unbekannt"): authors = None if (isbn is None) and (idn is None) and (title is None) and (authors is None): log.info( "This plugin requires at least either ISBN, IDN, Title or Author(s)." ) return None queries = [] # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results exact_search = {} if idn is not None: queries.append('num=' + idn) exact_search['idn'] = idn else: authors_v = [] title_v = [] if authors is not None: authors_v.append(' '.join(authors)) authors_v.append(' '.join( self.get_author_tokens(authors, only_first_author=False))) authors_v.append(' '.join( self.get_author_tokens(authors, only_first_author=True))) if title is not None: title_v.append(title) title_v.append(' '.join( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False))) title_v.append(' '.join( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True))) if isbn is not None: exact_search['isbn'] = isbn # title and author if authors is not None and title is not None: for a in authors_v: for t in title_v: if isbn is not None: queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"') else: queries.append('tit="' + t + '" AND per="' + a + '"') # try with author and title swapped if isbn is not None: queries.append('per="' + title + '" AND tit="' + authors[0] + '" AND num="' + isbn + '"') else: queries.append('per="' + title + '" AND tit="' + authors[0] + '"') # author(s) but no title elif authors is not None and title is None: for i in authors_v: if isbn is not None: queries.append('per="' + i + '" AND num="' + isbn + '"') else: queries.append('per="' + i + '"') # try with author and title swapped if isbn is not None: queries.append('tit="' + authors[0] + '" AND num="' + isbn + '"') else: queries.append('tit="' + authors[0] + '"') # title but no author(s) elif authors is None and title is not None: for i in title_v: if isbn is not None: queries.append('tit="' + i + '" AND num="' + isbn + '"') else: queries.append('tit="' + i + '"') # try with author and title swapped if isbn is not None: queries.append('per="' + title + '" AND num="' + isbn + '"') else: queries.append('per="' + title + '"') # as last resort only use isbn if isbn is not None: queries.append('num=' + isbn) # Sort queries descending by length (assumption: longer query -> less but better results) #queries.sort(key=len) #queries.reverse() # remove duplicate queries uniqueQueries = [] for i in queries: if i not in uniqueQueries: uniqueQueries.append(i) # Process queries results = None for query in uniqueQueries: query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)' log.info(query) if self.cfg_dnb_token is None: results = self.getSearchResultsByScraping(log, query, timeout) else: results = self.getSearchResults(log, query, timeout) if results is None: continue log.info("Parsing records") ns = {'marc21': 'http://www.loc.gov/MARC21/slim'} for record in results: series = None series_index = None publisher = None pubdate = None languages = [] title = None title_sort = None edition = None comments = None idn = None urn = None isbn = None ddc = [] subjects_gnd = [] subjects_non_gnd = [] # Title: Field 245 title_parts = [] # if
a,n,p exist: series = a, series_index = n, title = p for i in record.xpath( ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..", namespaces=ns): series_index = i.xpath(".//marc21:subfield[@code='n']", namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match: series_index = match.group(1) else: series_index = "0" # looks like sometimes DNB does not know the series index and uses something like "[...]" series_index = series_index.replace(',', '.') series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip() title_parts.append( i.xpath(".//marc21:subfield[@code='p']", namespaces=ns)[0].text.strip()) log.info("Extracted Series: %s" % series) log.info("Extracted Series Index: %s" % series_index) break # otherwise: title = a if len(title_parts) == 0: for i in record.xpath( ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): title_parts.append(i.text.strip()) break # subtitle 1 for i in record.xpath( ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns): title_parts.append(i.text.strip()) break # subtitle 2 #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns): # title = title + " / " + i.text.strip() # break title = " : ".join(title_parts) log.info("Extracted Title: %s" % title) # Title_Sort title_sort_parts = list(title_parts) title_sort_regex = re.match( '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0]) sortword = title_sort_regex.group(2) if sortword: title_sort_parts[0] = ''.join( filter(None, [ title_sort_regex.group(1).strip(), title_sort_regex.group(3).strip(), ", " + sortword ])) title_sort = " : ".join(title_sort_parts) log.info("Extracted Title_Sort: %s" % title_sort) # Authors authors = [] author_sort = None for i in record.xpath( ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # primary authors name = re.sub(" \[.*\]$", "", i.text.strip()) authors.append(name) for i in record.xpath( ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # secondary authors name = re.sub(" \[.*\]$", "", i.text.strip()) authors.append(name) if len( authors ) == 0: # if no "real" autor was found take all persons involved for i in record.xpath( ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # secondary authors name = re.sub(" \[.*\]$", "", i.text.strip()) authors.append(name) if len(authors) > 0: author_sort = authors[0] log.info("Extracted Authors: %s" % " & ".join(authors)) # Comments for i in record.xpath( ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns): if i.text.startswith("http://deposit.dnb.de/"): br = self.browser log.info('Downloading Comments from: %s' % i.text) try: comments = br.open_novisit(i.text, timeout=30).read() comments = sanitize_comments_html(comments) log.info('Comments: %s' % comments) break except: log.info("Could not download Comments from %s" % i) # Publisher Name and Location publisher_name = None publisher_location = None fields = record.xpath( 
".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns) if len(fields) > 0: publisher_name = fields[0].xpath( ".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip() publisher_location = fields[0].xpath( ".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip() else: fields = record.xpath( ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns) if len(fields) > 0: publisher_name = fields[0].xpath( ".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip() else: fields = record.xpath( ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns) if len(fields) > 0: publisher_location = fields[0].xpath( ".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip() log.info("Extracted Publisher: %s" % publisher_name) log.info("Extracted Publisher Location: %s" % publisher_location) # Publishing Date for i in record.xpath( ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns): match = re.search("(\d{4})", i.text.strip()) if match is not None: year = match.group(1) pubdate = datetime.datetime(int(year), 1, 2) break log.info("Extracted Publication Year: %s" % pubdate) # ID: IDN for i in record.xpath( ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): idn = i.text.strip() break log.info("Extracted ID IDN: %s" % idn) if "idn" in exact_search: if idn != exact_search["idn"]: log.info( "Extracted IDN does not match book's IDN, skipping record" ) continue # ID: URN for i in record.xpath( ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): urn = i.text.strip() break log.info("Extracted ID URN: %s" % urn) # ID: ISBN for i in record.xpath( ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): isbn_regex = "(?:ISBN(?:-1[03])?:? 
)?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]" match = re.search(isbn_regex, i.text.strip()) isbn = match.group() isbn = isbn.replace('-', '') break log.info("Extracted ID ISBN: %s" % isbn) if "isbn" in exact_search: if isbn != exact_search["isbn"]: log.info( "Extracted ISBN does not match book's ISBN, skipping record" ) continue # ID: Sachgruppe (DDC) for i in record.xpath( ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): ddc.append(i.text.strip()) log.info("Extracted ID DDC: %s" % ",".join(ddc)) # Series and Series_Index if series is None and series_index is None: for i in record.xpath( ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns): # Series Index series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',', '.') log.info("Extracted Series Index: %s" % series_index) # Series series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip() log.info("Extracted Series: %s" % series) break # Try to extract Series, Series Index and Title from the fetched title. # Caution: This overwrites DNB's series/series_index and modifies the title! if self.cfg_guess_series is True: guessed_series = None guessed_series_index = None parts = re.split("[:]", self.removeSortingCharacters(title)) if len(parts) == 2: if bool(re.search("\d", parts[0])) != bool( re.search("\d", parts[1])): # figure out which part contains the index if bool(re.search("\d", parts[0])): indexpart = parts[0] textpart = parts[1] else: indexpart = parts[1] textpart = parts[0] match = re.match( "^[\s\-–:]*(.+?)[\s\-–:]*$", textpart ) # remove odd characters from start and end of the text part if match: textpart = match.group(1) # from Titleparts like: "Name of the series - Episode 2" OK match = re.match( "^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) if guessed_series is None: guessed_series = textpart title = textpart + " : Band " + guessed_series_index else: title = textpart else: # from Titleparts like: "Episode 2 Name of the series" match = re.match( "^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$", indexpart) if match: guessed_series_index = match.group(1) guessed_series = match.group(2) if guessed_series is None: guessed_series = textpart title = textpart + " : Band " + guessed_series_index else: title = textpart elif len(parts) == 1: # from Titles like: "Name of the series - Title (Episode 2)" match = re.match( "^\s*(\S.+?) \- (\S.+?) 
[\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0]) if match: guessed_series_index = match.group(3) guessed_series = match.group(1) title = match.group(2) else: # from Titles like: "Name of the series - Episode 2" match = re.match( "^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0]) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) title = guessed_series + " : Band " + guessed_series_index if guessed_series is not None and guessed_series_index is not None: series = guessed_series series_index = guessed_series_index log.info("Guessed Series: %s" % series) log.info("Guessed Series Index: %s" % series_index) # GND Subjects from 689 for i in record.xpath( ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): subjects_gnd.append(i.text.strip()) # GND Subjects from 600-655 for f in range(600, 656): for i in record.xpath(".//marc21:datafield[@tag='" + str( f ) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): if i.text.startswith("("): continue subjects_gnd.append(i.text) log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd)) # Non-GND subjects from 600-655 for f in range(600, 656): for i in record.xpath(".//marc21:datafield[@tag='" + str( f ) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # ignore entries starting with "(": if i.text.startswith("("): continue subjects_non_gnd.extend(re.split(',|;', i.text)) # remove one-character subjects: for i in subjects_non_gnd: if len(i) < 2: subjects_non_gnd.remove(i) log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd)) # Edition for i in record.xpath( ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): edition = i.text.strip() break log.info("Extracted Edition: %s" % edition) # Languages for i in record.xpath( ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): languages.append(i.text.strip()) if languages is not None: log.info("Extracted Languages: %s" % ",".join(languages)) # Put it all together if self.cfg_append_edition_to_title == True and edition is not None: title = title + " : " + edition mi = Metadata( self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors)) mi.title_sort = self.removeSortingCharacters(title_sort) mi.author_sort = self.removeSortingCharacters(author_sort) mi.languages = languages mi.pubdate = pubdate mi.publisher = " : ".join( filter(None, [ publisher_location, self.removeSortingCharacters(publisher_name) ])) mi.series = self.removeSortingCharacters(series) mi.series_index = series_index mi.comments = comments mi.isbn = isbn # also required for cover download mi.set_identifier('urn', urn) mi.set_identifier('dnb-idn', idn) mi.set_identifier('ddc', ",".join(ddc)) if self.cfg_fetch_subjects == 0: mi.tags = self.uniq(subjects_gnd) elif self.cfg_fetch_subjects == 1: if len(subjects_gnd) > 0: mi.tags = self.uniq(subjects_gnd) else: mi.tags = self.uniq(subjects_non_gnd) elif self.cfg_fetch_subjects == 2: mi.tags = self.uniq(subjects_gnd + subjects_non_gnd) elif self.cfg_fetch_subjects == 3: if len(subjects_non_gnd) > 0: mi.tags = self.uniq(subjects_non_gnd) else: mi.tags = self.uniq(subjects_gnd) elif self.cfg_fetch_subjects == 4: 
mi.tags = self.uniq(subjects_non_gnd) elif self.cfg_fetch_subjects == 5: mi.tags = [] # put current result's metadata into result queue log.info("Final formatted result: %s" % mi) result_queue.put(mi)
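# self.uniq() above deduplicates the subject lists; for the tag handling to be
# stable it needs to preserve first-seen order, unlike a plain set(). A
# minimal order-preserving sketch of such a helper (assumed behaviour; the
# plugin's own implementation is outside this excerpt):
def uniq(values):
    seen = set()
    out = []
    for v in values:
        if v not in seen:
            seen.add(v)
            out.append(v)
    return out
# uniq(['Fantasy', 'Roman', 'Fantasy']) -> ['Fantasy', 'Roman']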
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    from calibre.utils.date import parse_date, utcnow

    douban_id = entry_.get('id')
    title = entry_.get('title')
    description = entry_.get('summary')
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get('publisher')
    isbn = entry_.get('isbn13')  # ISBN10 is obsolete, use ISBN13
    pubdate = entry_.get('pubdate')
    authors = entry_.get('author')
    book_tags = entry_.get('tags')
    rating = entry_.get('rating')
    cover_url = entry_.get('images', {}).get('large')
    series = entry_.get('series')

    if not authors:
        authors = [_('Unknown')]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {'douban': douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN
    isbns = []
    if isinstance(isbn, (type(''), bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for x in isbn:
            if check_isbn(x):
                isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    mi.tags = [tag['name'] for tag in book_tags]

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating:
        try:
            mi.rating = float(rating['average']) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series['title']

    return mi
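# Douban reports the average rating on a 0-10 scale while calibre's
# Metadata.rating is 0-5, hence the division by 2.0 above. A tiny worked
# example of that conversion:
def douban_rating_to_calibre(rating):
    # {'average': '8.6'} -> 4.3
    return float(rating['average']) / 2.0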
def extract_vol_details(self, vol_url): # Here we extract and format the information from the chosen volume. # - The first name and last name to populate author and author sort : vol_auteur_prenom and vol_auteur_nom # - The title of the volume : vol_title # - The series name the volume is part of : vol_serie # - The sequence number in the series : vol_serie_seq # missing # - The editor of this volume : vol_editor # - The editor's collection of this volume : vol_coll # - The collection serial code of this volume : vol_coll_srl # - The "dépot légal" date (the publication date is mostly unknown) : vol_dp_lgl # date format to be computed # - The ISBN number associated with the volume : vol_isbn # - The volume tags : vol_genre # - The url pointer to the volume cover image : vol_cover_index # - The comments include various info about the book : vol_comment_soup # . reference, a url pointer to noosfere # . couverture, a url pointer to noosfere; the cover may be really small, but it is accurate to the volume # . first edition information # . series (cycle) name and number # . this volume editor info # . Resume (quatrième de couverture) # . Critiques # . Sommaire detailing what novels are in the volume when it is an anthology # . Critiques about the series and/or about another volume of the book # debug = self.dbg_lvl & 2 self.log.info(self.who, "\nIn extract_vol_details(soup)") if debug: self.log.info(self.who, "vol_url : ", vol_url) if debug: self.log.info( self.who, "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')" ) self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who) rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who) soup = rsp[0] url_vrai = rsp[1].replace("&Tri=3", "") # if debug: self.log.info(self.who,soup.prettify()) # useful but too big... self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace( '?', '&').replace('=', '&').split('&')[2] # self.nsfr_id = (self.nfsr_id).strip("$") # If I use this form, it gives this error: 'Worker' object has no attribute 'nfsr_id' ???
tmp = self.nsfr_id self.nsfr_id = tmp.strip('$') if debug: self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id, type(self.nsfr_id)) tmp_lst = [] vol_info = {} vol_title = "" vol_auteur = "" vol_auteur_prenom = "" vol_auteur_nom = "" vol_serie = "" vol_serie_seq = "" vol_editor = "" vol_coll = "" vol_coll_srl = "" vol_dp_lgl = "" vol_isbn = "" vol_genre = "" vol_cover_index = "" comment_generic = None comment_resume = None comment_Critiques = None comment_Sommaire = None comment_AutresCritique = None comment_cover = None comment_decoupage_annexe = None # add volume address as a reference in the comment vol_comment_soup = BS( '<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai + '</a></p></div>', "lxml") if debug: self.log.info(self.who, "vol reference processed") if soup.select("span[class='TitreNiourf']"): vol_title = soup.select( "span[class='TitreNiourf']")[0].text.strip() if debug: self.log.info(self.who, "vol_title processed : ", vol_title) if soup.select("span[class='AuteurNiourf']"): vol_auteur = soup.select( "span[class='AuteurNiourf']")[0].text.replace("\n", "").strip() if debug: self.log.info(self.who, "vol_auteur processed : ", vol_auteur) for i in range(len(vol_auteur.split())): if not vol_auteur.split()[i].isupper(): vol_auteur_prenom += " " + vol_auteur.split()[i] else: vol_auteur_nom += " " + vol_auteur.split()[i].title() vol_auteur = vol_auteur.title() vol_auteur_prenom = vol_auteur_prenom.strip() if debug: self.log.info(self.who, "vol_auteur_prenom processed : ", vol_auteur_prenom) vol_auteur_nom = vol_auteur_nom.strip() if debug: self.log.info(self.who, "vol_auteur_nom processed : ", vol_auteur_nom) if soup.select("a[href*='serie.asp']"): if soup.select("a[href*='serie.asp']")[0].find_parent( "span", {"class": "ficheNiourf"}): vol_serie = soup.select("a[href*='serie.asp']")[0].text tmp_vss = [ x for x in soup.select("a[href*='serie.asp']") [0].parent.stripped_strings ] for i in range(len(tmp_vss)): if "vol." 
in tmp_vss[i]: if not vol_serie_seq: vol_serie_seq = tmp_vss[i].replace("vol.", "").strip() if "découpage" in tmp_vss[i]: dec_anx_url = "https://www.noosfere.org/livres/" + soup.select( "a[href*='serie.asp']")[0]['href'] comment_pre_decoupage_annexe = BS( '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>', "lxml") comment_decoupage_annexe = self.get_decoupage_annexe( dec_anx_url) if debug: self.log.info(self.who, "vol_serie, vol_serie_seq processed : ", vol_serie, ",", vol_serie_seq) comment_generic = soup.select("span[class='ficheNiourf']")[0] new_div = soup.new_tag('div') comment_generic = comment_generic.wrap(new_div) if debug: self.log.info(self.who, "comment_generic processed") if soup.select("a[href*='editeur.asp']"): vol_editor = soup.select("a[href*='editeur.asp']")[0].text if debug: self.log.info(self.who, "vol_editor processed : ", vol_editor) if soup.select("a[href*='collection.asp']"): vol_coll = soup.select("a[href*='collection.asp']")[0].text if debug: self.log.info(self.who, "vol_coll : ", vol_coll) for i in comment_generic.stripped_strings: tmp_lst.append(str(i)) vol_coll_srl = tmp_lst[len(tmp_lst) - 1] if "n°" in vol_coll_srl: for k in ["n°", "(", ")"]: if k in vol_coll_srl: vol_coll_srl = vol_coll_srl.replace(k, "") vol_coll_srl = vol_coll_srl.strip() vol_coll_srl = vol_coll_srl.split("/")[0] if vol_coll_srl[0].isnumeric(): vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:] else: vol_coll_srl = "" if debug: self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl) # publication date is largely ignored in noosfere, but we have the "dépot legal" date and I use it instead # note that I 'calculate' the missing day of the month and even sometimes the missing month ms = ("janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre") for elemnt in soup.select_one( "span[class='sousFicheNiourf']").stripped_strings: if debug: self.log.info(self.who, "elemnt : ", elemnt) if not vol_dp_lgl: elemn = (elemnt.replace("Dépôt légal :", "").split(','))[0].strip() if elemn: if elemn.isnumeric() and len(elemn) == 4: vol_dp_lgl = datetime.datetime.strptime( "175 " + elemn, "%j %Y") elif "semestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:] + " " + ele[2], "%j %Y") elif "trimestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:] + " " + ele[2], "%j %Y") else: for i in range(len(ms)): if ms[i] in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str(10 + 31 * i))[-3:] + " " + ele[1], "%j %Y") break if debug: self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl) if "ISBN" in elemnt: vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '') if "néant" in vol_isbn: vol_isbn = "" if debug: self.log.info(self.who, "vol_isbn processed : ", vol_isbn) if "Genre" in elemnt: vol_genre = elemnt.lstrip("Genre : ") if debug: self.log.info(self.who, "vol_genre processed : ", vol_genre) if soup.select("img[name='couverture']"): for elemnt in repr( soup.select("img[name='couverture']")[0]).split('"'): if "http" in elemnt: if not vol_cover_index: vol_cover_index = elemnt if debug: self.log.info(self.who, "vol_cover_index processed : ", vol_cover_index) # add cover image address as a reference in the comment if vol_cover_index: comment_cover = BS( '<div><p>Couverture: <a href="' + vol_cover_index + '">' + 
vol_cover_index + '</a></p></div>', "lxml") # select the fields I want... More exist, such as film adaptations or reading advice # but that is not quite consistent across all the books (noosfere is a common database fed by many people) # and besides I have enough info as it is AND I do NOT want to take away noosfere's business tmp_comm_lst = soup.select("span[class='AuteurNiourf']") if debug: self.log.info(self.who, tmp_comm_lst) # useful but too long for i in range(len(tmp_comm_lst)): if "Quatrième de couverture" in str(tmp_comm_lst[i]): comment_resume = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_resume processed") if "Critiques" in str(tmp_comm_lst[i]): if not "autres" in str(tmp_comm_lst[i]): comment_Critiques = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Critiques processed") if "Sommaire" in str(tmp_comm_lst[i]): comment_Sommaire = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Sommaire processed") if "Critiques des autres" in str(tmp_comm_lst[i]): comment_AutresCritique = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if comment_AutresCritique.select('a[href*="serie.asp"]') and ( "Critique de la série" in comment_AutresCritique. select('a[href*="serie.asp"]')[0].text): critic_url = "https://www.noosfere.org/livres/" + comment_AutresCritique.select( 'a[href*="serie.asp"]')[0]['href'] try: more_comment_AutresCritique = self.get_Critique_de_la_serie( critic_url) comment_AutresCritique.append( more_comment_AutresCritique) except: self.log.exception( "get_Critique_de_la_serie failed for url: ", critic_url) if debug: self.log.info(self.who, "comment_AutresCritique processed") # group in a big bundle all the fields I think I want... (It is difficult not to include more... :-)) if comment_cover: vol_comment_soup.append(comment_cover) if comment_generic: vol_comment_soup.append(comment_generic) if comment_resume: vol_comment_soup.append(comment_resume) if comment_Critiques: vol_comment_soup.append(comment_Critiques) if comment_Sommaire: vol_comment_soup.append(comment_Sommaire) if comment_AutresCritique: vol_comment_soup.append(comment_AutresCritique) if comment_decoupage_annexe: vol_comment_soup.append( comment_pre_decoupage_annexe) # this is the title vol_comment_soup.append(comment_decoupage_annexe) # # Make a minimum of "repair" over vol_comment_soup so that it displays correctly (how I like it) in the comments and in my catalogs # - I hate justify when it makes margins "float" around the correct position (in fact when spaces are used instead of absolute positioning) # - I like to have functional urls when they exist # - I like to find out the next and/or previous books in a series (simulated arrows are links :-) ) for elemnt in vol_comment_soup.select('[align="justify"]'): del elemnt['align'] # remove all double or triple 'br' to improve presentation. # Note: tmp1 and tmp2 must contain a different value from any possible first elemnt. (yes, I am lrp and I am unique :-) ) # # yeah, well: if I modify comment_generic AFTER it has been integrated into vol_comment_soup, there is only one version in memory... # so vol_comment_soup gets modified...
        tmp1 = tmp2 = "lrp_the_unique"
        for elemnt in vol_comment_soup.findAll():
            tmp1, tmp2 = tmp2, elemnt
            if tmp1 == tmp2:
                elemnt.extract()

        # set the author headings apart: preceded by a line break, bold, bigger
        for elemnt in vol_comment_soup.select('.AuteurNiourf'):
            # a fresh tag is needed each time: inserting the same 'br' tag
            # object repeatedly would only move it from heading to heading
            elemnt.insert(0, soup.new_tag('br'))
            elemnt["style"] = "font-weight: 600; font-size: 18px"

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who,
                                  "incomplete url before correction: ", elemnt)

        # Make the relative links absolute. The order matters: the more
        # specific forms must be patched before the bare file names, and the
        # 'http' guard keeps already-corrected links from being patched twice.
        url_fixes = (
            ("a[href*='/livres/auteur.asp']", "/livres/auteur.asp",
             "https://www.noosfere.org/livres/auteur.asp"),
            ("a[href*='/livres/niourf.asp']", "/livres/niourf.asp",
             "https://www.noosfere.org/livres/niourf.asp"),
            ("a[href*='/heberg/']", "/heberg/",
             "https://www.noosfere.org/heberg/"),
            ("a[href*='./EditionsLivre.asp']", "./EditionsLivre.asp",
             "https://www.noosfere.org/livres/EditionsLivre.asp"),
            ("a[href*='./niourf.asp']", "./niourf.asp",
             "https://www.noosfere.org/livres/niourf.asp"),
            ("a[href*='heberg']", "../../heberg",
             "https://www.noosfere.org/heberg"),
            ("a[href*='../bd']", "../bd",
             "https://www.noosfere.org/bd"),
            ("a[href*='auteur.asp']", "auteur.asp",
             "https://www.noosfere.org/livres/auteur.asp"),
            ("a[href*='collection.asp']", "collection.asp",
             "https://www.noosfere.org/livres/collection.asp"),
            ("a[href*='critsign.asp']", "critsign.asp",
             "https://www.noosfere.org/livres/critsign.asp"),
            ("a[href*='EditionsLivre.asp']", "EditionsLivre.asp",
             "https://www.noosfere.org/livres/EditionsLivre.asp"),
            ("a[href*='editeur.asp']", "editeur.asp",
             "https://www.noosfere.org/livres/editeur.asp"),
            ("a[href*='editionslivre.asp']", "editionslivre.asp",
             "https://www.noosfere.org/livres/editionslivre.asp"),
            ("a[href*='niourf.asp']", "niourf.asp",
             "https://www.noosfere.org/livres/niourf.asp"),
            ("a[href*='serie.asp']", "serie.asp",
             "https://www.noosfere.org/livres/serie.asp"),
        )
        for selector, relative, absolute in url_fixes:
            for elemnt in vol_comment_soup.select(selector):
                if 'http' not in elemnt.get('href'):
                    elemnt["href"] = elemnt["href"].replace(relative, absolute)
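        # The table above patches each relative form explicitly. A more
        # generic alternative (a sketch only, not wired in, and it would need
        # checking against noosfere's "../.." forms) would be
        # urllib.parse.urljoin with the page URL as base:
        #
        #   from urllib.parse import urljoin
        #   base = "https://www.noosfere.org/livres/niourf.asp"
        #   urljoin(base, "auteur.asp")      # -> https://www.noosfere.org/livres/auteur.asp
        #   urljoin(base, "../bd/toto.asp")  # -> https://www.noosfere.org/bd/toto.asp
        #                                    #    (toto.asp is a made-up name)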
"https://www.noosfere.org/livres/serie.asp") if debug: for elemnt in vol_comment_soup.select("a[href*='.asp']"): if 'http' not in elemnt.get('href'): self.log.info(self.who, "url incomplet apres correction: ", elemnt) fg, fd = "<<==", "==>>" #chr(0x21D0),chr(0x21D2) #chr(0x27f8),chr(0x27f9) for elemnt in vol_comment_soup.select("img[src*='arrow_left']"): elemnt.replace_with(fg) for elemnt in vol_comment_soup.select("img[src*='arrow_right']"): elemnt.replace_with(fd) # depending on the tick box, make a fat publisher using seperators that have a very low probability to pop up (§ and €) # only set vol_coll_srl if vol_coll exists # the idea is to use search and replace in the edit Metadata in bulk window. if self.extended_publisher: if debug: self.log.info( self.who, """flag : "Ajoute collection et son numéro d'ordre au champ èditeur" set""" ) if vol_coll: if debug: self.log.info(self.who, 'add collection') vol_editor = vol_editor + ('§') + vol_coll if vol_coll_srl: if debug: self.log.info(self.who, 'add collection number') vol_editor = vol_editor + ('€') + vol_coll_srl if vol_serie: if vol_serie_seq.isnumeric(): vol_serie_seq = float(vol_serie_seq) else: vol_serie_seq = 1.0 # UTF-8 characters may be serialized different ways, only xmlcharrefreplace produces xml compatible strings # any other non ascii character with another utf-8 byte representation will make calibre behave with the messsage: # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters # Side note: # I have no real good url structure(i once got html 3 times, div a sibling of html...), but calibre does not seems to care (nice :-) ) # # Ca m'a pris un temps fou pour trouver, par hazard, que encode('ascii','xmlcharrefreplace') aidait bien... # (enfin, quasi par hazard, j' ai essayé tout ce qui pouvait ameliorer la compatibilité avec xml... 
        # UTF-8 characters may be serialized in different ways; only
        # xmlcharrefreplace produces XML-compatible strings. Any other
        # non-ASCII character, with another UTF-8 byte representation, makes
        # calibre fail with the message:
        #   ValueError: All strings must be XML compatible: Unicode or ASCII,
        #   no NULL bytes or control characters
        # Side note: the html structure produced is not really sound (I once
        # got html three times, with a div as a sibling of html...), but
        # calibre does not seem to care (nice :-)).
        # It took me a crazy amount of time to discover, almost by chance,
        # that encode('ascii', 'xmlcharrefreplace') was the fix (almost by
        # chance: I had tried everything that could improve compatibility
        # with xml... but I misread the error and thought the problem was an
        # incompatibility with the xml structure).
        vol_comment_soup = vol_comment_soup.encode('ascii', 'xmlcharrefreplace')

        self.log.info(self.who, "+++" * 25)
        self.log.info(self.who, "nsfr_id, type() : ", self.nsfr_id,
                      type(self.nsfr_id))        # must be <class 'str'>
        self.log.info(self.who, "relevance, type() : ", self.relevance,
                      type(self.relevance))      # must be <class 'float'>
        self.log.info(self.who, "vol_title, type() : ", vol_title,
                      type(vol_title))           # must be <class 'str'>
        self.log.info(self.who, "vol_auteur, type() : ", vol_auteur,
                      type(vol_auteur))          # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_auteur_prenom, type() : ",
                      vol_auteur_prenom,
                      type(vol_auteur_prenom))   # must be <class 'str'>
        self.log.info(self.who, "vol_auteur_nom, type() : ", vol_auteur_nom,
                      type(vol_auteur_nom))      # must be <class 'str'>
        if vol_serie:
            self.log.info(self.who, "vol_serie, type() : ", vol_serie,
                          type(vol_serie))       # must be <class 'str'>
            self.log.info(self.who, "vol_serie_seq, type() : ", vol_serie_seq,
                          type(vol_serie_seq))   # must be <class 'float'>
        self.log.info(self.who, "vol_editor, type() : ", vol_editor,
                      type(vol_editor))          # must be <class 'str'>
        self.log.info(self.who, "vol_coll, type() : ", vol_coll,
                      type(vol_coll))            # must be <class 'str'>
        self.log.info(self.who, "vol_coll_srl, type() : ", vol_coll_srl,
                      type(vol_coll_srl))        # must be <class 'str'>
        self.log.info(self.who, "vol_dp_lgl, type() : ", vol_dp_lgl,
                      type(vol_dp_lgl))          # must be <class 'datetime.datetime'> ('renderer=isoformat')
        self.log.info(self.who, "vol_isbn, type() : ", vol_isbn,
                      type(vol_isbn))            # must be <class 'str'>
        self.log.info(self.who, "vol_genre, type() : ", vol_genre,
                      type(vol_genre))           # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_cover_index, type() : ", vol_cover_index,
                      type(vol_cover_index))     # must be <class 'str'>
        self.log.info(self.who, "type(vol_comment_soup) : ",
                      type(vol_comment_soup))    # must be bytes (starts with b'blablabla...)
        # self.log.info(self.who, "vol_comment_soup :\n", vol_comment_soup)  # maybe a bit long sometimes
        # language must be <class 'str'>

        if vol_cover_index:
            self.plugin.cache_identifier_to_cover_url(self.nsfr_id,
                                                      vol_cover_index)
        if vol_isbn:
            self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id)

        mi = Metadata(vol_title, [vol_auteur])
        mi.set_identifier('nsfr_id', self.nsfr_id)
        mi.publisher = vol_editor
        mi.isbn = vol_isbn
        mi.tags = [vol_genre]
        mi.source_relevance = self.relevance
        mi.has_cover = bool(vol_cover_index)
        if vol_dp_lgl:
            mi.pubdate = vol_dp_lgl
        if vol_serie:
            mi.series = vol_serie
            mi.series_index = vol_serie_seq
        mi.language = "fra"
        mi.comments = vol_comment_soup

        if debug:
            self.log.info(self.who, "mi\n", mi, "\n")
        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
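        # Hedged sketch of how the caches filled above are consumed later on.
        # As I understand calibre's Source API, the cover downloader can
        # recover the URL cached under the identifier with something like
        #
        #   url = self.plugin.cached_identifier_to_cover_url(self.nsfr_id)
        #
        # falling back to cached_isbn_to_identifier(vol_isbn) when only the
        # ISBN is known; both getters mirror the cache_* setters used above.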