def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id"""
    issue = pycomicvine.Issue(
        issue_id,
        field_list=[
            "id",
            "name",
            "volume",
            "issue_number",
            "person_credits",
            "description",
            "store_date",
            "cover_date",
        ],
    )
    if not issue or not issue.volume:
        log.warn("Unable to load Issue(%d)" % issue_id)
        return None
    title = "%s #%s" % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ": %s" % (issue.name)
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier("comicvine", str(issue.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
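calibre stores series_index as a float, while the assignment above stores str(issue.issue_number). A minimal defensive sketch of the conversion; the _to_series_index helper is illustrative and not part of the original plugin:

def _to_series_index(issue_number, default=1.0):
    """Coerce a comic issue number to calibre's float series_index."""
    try:
        # Plain numeric issue numbers: "12", "12.5"
        return float(issue_number)
    except (TypeError, ValueError):
        # Non-numeric issues such as "Annual 1" fall back to the default.
        return default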
Example #2
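A metadata-download worker thread: it fetches a book via Book.from_url, refuses to continue without at least a title and authors, fills a Metadata record (ISBN identifier, pubdate, rating, languages, publisher), caches the cover URL against the ISBN, and puts the cleaned result on the plugin's result queue.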
    def run(self):
        try:
            self.log.info('Worker parsing url: %r' % self.url)
            
            book = Book.from_url(self.browser, self.url, self.timeout, self.log)
            
            if not book.get("title") or not book.get("authors"):
                self.log.error('Insufficient metadata found for %r' % self.url)
                return
            
            title = book["title"].encode('utf-8')
            authors = [a.encode('utf-8') for a in book["authors"]]

            mi = Metadata(title, authors)
            
            isbn = book.get("ean") or book.get("isbn")
            if isbn:
                mi.set_identifier("isbn", isbn)
            
            for attr in ("pubdate", "rating", "languages"):
                if attr in book:
                    setattr(mi, attr, book[attr])
            
            if book.get("publisher"):
                mi.publisher = book["publisher"].encode('utf-8')
                    
            if book.get("cover_url"):
                self.plugin.cache_identifier_to_cover_url(isbn, book["cover_url"])
                mi.has_cover = True

            self.plugin.clean_downloaded_metadata(mi)
            self.result_queue.put(mi)
        except Exception as e:
            self.log.exception('Worker failed to fetch and parse url %r with error %r' % (self.url, e))
    def _get_bookdetails(self, url):
        u = self.BASE_URL + url["url"]
        print("_get_bookdetails:: traukiam knygą iš %s" % u)

        resp = urllib2.urlopen(u)
        contents = resp.read()
        #print(contents)
        tree = etree.HTML(contents)

        authors = self._get_authors(tree)
        publisher = self._get_details(tree, self.details_publisher)
        year = self._get_year(tree)
        pages = self._get_details(tree, self.details_pages)
        isbn = self._get_details(tree, self.details_isbn)
        description = self._get_description(tree)
        cover = self._get_cover_url(tree)
        tags = self._get_tags(tree)

        mi = Metadata(url["title"], authors)
        mi.set_identifier("isbn", isbn)
        mi.comments = description
        mi.language = "LT"
        mi.tags = tags
        try:
            mi.set("publisher", publisher)
        except:
            print(u"_get_bookdetails:: nepavyko užsetinti leidėjo")
        try:
            mi.set("pubdate", datetime.datetime(year, 1, 2))
        except:
            print(u"_get_bookdetails:: nepavyko užsetinti leidimo datos")
        try:
            if self.gui:
                print("YYYYRAAA GUI!!!")
            col = {}
            col["#value#"] = pages
            mi.set_user_metadata("#count", col)
        except:
            print(u"_get_bookdetails:: nepavyko užsetinti puslapių skaičiaus")

        if cover and isbn:
            print(u"_get_bookdetails:: kešuojam viršelį:", cover)
            self.cache_isbn_to_identifier(isbn, isbn)
            self.cache_identifier_to_cover_url(isbn, cover)
            mi.has_cover = True

            print(self.cached_identifier_to_cover_url(isbn))

        return mi
Example #4
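The Comicvine build_meta again, this time delegating the lookup to a PyComicvineWrapper helper so the function only has to copy the already-extracted values onto the Metadata record.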
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id."""
    issue = PyComicvineWrapper(log).lookup_issue(issue_id)
    if issue:
        meta = Metadata(issue.get_full_title(), issue.get_authors())
        meta.series = issue.volume_name
        meta.series_index = issue.issue_number
        meta.set_identifier('comicvine', str(issue.id))
        meta.set_identifier('comicvine-volume', str(issue.volume_id))
        meta.comments = issue.description
        meta.has_cover = False
        meta.publisher = issue.publisher_name
        meta.pubdate = issue.date
        return meta
    else:
        return None
Example #5
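An Edelweiss parser that builds the record from pre-extracted basic_data (title, authors, ISBNs, tags, publisher, pubdate, rating) and renders the comments from the summary, contributor-bio and quotes/reviews sections of the fetched HTML.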
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import UNDEFINED_DATE
        root = parse_html(raw)
        mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

        # Identifiers
        if self.basic_data['isbns']:
            mi.isbn = self.basic_data['isbns'][0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        if self.basic_data['tags']:
            mi.tags = self.basic_data['tags']
            mi.tags = [
                t[1:].strip() if t.startswith('&') else t for t in mi.tags
            ]

        # Publisher
        mi.publisher = self.basic_data['publisher']

        # Pubdate
        if self.basic_data['pubdate'] and self.basic_data[
                'pubdate'].year != UNDEFINED_DATE.year:
            mi.pubdate = self.basic_data['pubdate']

        # Rating
        if self.basic_data['rating']:
            mi.rating = self.basic_data['rating']

        # Comments
        comments = ''
        for cid in ('summary', 'contributorbio', 'quotes_reviews'):
            cid = 'desc_{}{}-content'.format(cid, self.sku)
            div = root.xpath('//*[@id="{}"]'.format(cid))
            if div:
                comments += self.render_comments(div[0])
        if comments:
            mi.comments = comments

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None
        return mi
Example #7
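The same Comicvine build_meta as the first example, in its original two-space indentation; unlike that copy it also records a comicvine-volume identifier.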
def build_meta(log, issue_id):
  '''Build metadata record based on comicvine issue_id'''
  issue = pycomicvine.Issue(issue_id, field_list=[
      'id', 'name', 'volume', 'issue_number', 'person_credits', 'description', 
      'store_date', 'cover_date'])
  if not issue or not issue.volume:
    log.warn('Unable to load Issue(%d)' % issue_id)
    return None
  title = '%s #%s' % (issue.volume.name, issue.issue_number)
  if issue.name:
    title = title + ': %s' % (issue.name)
  authors = [p.name for p in issue.person_credits]
  meta = Metadata(title, authors)
  meta.series = issue.volume.name
  meta.series_index = str(issue.issue_number)
  meta.set_identifier('comicvine', str(issue.id))
  meta.set_identifier('comicvine-volume', str(issue.volume.id))
  meta.comments = issue.description
  meta.has_cover = False
  if issue.volume.publisher:
    meta.publisher = issue.volume.publisher.name
  meta.pubdate = issue.store_date or issue.cover_date
  return meta
Example #8
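A parse_details worker for databazeknih.cz: every field is parsed in its own try/except so one failure cannot abort the whole record, and the cover URL and ISBN are cached against the databazeknih identifier before the record is queued.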
	def parse_details(self, root):
		search_data = ''
		isbn = None
		
		try:
			self.log.info('Parse details:%s'%self.url)
			databazeknih_id = self.parse_databazeknih_id(self.url)
			self.log.info('Parsed DK identifier:%s'%databazeknih_id)
		except:
			self.log.exception('Error parsing databazeknih id for url: %r'%self.url)
			databazeknih_id = None

		try:
			title = self.parse_title(root)
			self.log.info('Parsed title:%s'%title)
		except:
			self.log.exception('Error parsing title for url: %r'%self.url)
			title = None
		
		try:
			authors = self.parse_authors(root)
			self.log.info('Parsed authors:%s'%authors)
		except:
			self.log.exception('Error parsing authors for url: %r'%self.url)
			authors = []

		if not title or not authors or not databazeknih_id:
			self.log.error('Could not find title/authors/databazeknih id for %r'%self.url)
			self.log.error('DK id: %r Title: %r Authors: %r'%(databazeknih_id, title, authors))
			return

		mi = Metadata(title, authors)
		self.log.info('dbki:%s'%databazeknih_id)
		mi.set_identifier('databazeknih', databazeknih_id)
		self.databazeknih_id = databazeknih_id

		try:
			(mi.series, mi.series_index) = self.parse_series(root)
			self.log.info('Parsed series:%s'%mi.series)
			self.log.info('Parsed series index:%s'%mi.series_index)
		except :
			self.log.exception('Error parsing series for url: %r'%self.url)
			series = None
			
		try:
			mi.comments = self.parse_comments(root)
			self.log.info('Parsed comments:%s'%mi.comments)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_cover(root)
			self.log.info('Parsed URL for cover:%r'%self.cover_url)
			self.plugin.cache_identifier_to_cover_url(self.databazeknih_id, self.cover_url)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)
		mi.has_cover = bool(self.cover_url)

		try:
			mi.tags = self.parse_tags(root)
			self.log.info('Parsed tags:%s'%mi.tags)
		except:
			self.log.exception('Error parsing tags for url: %r'%self.url)
			
		try:
			mi.publisher = self.parse_publisher(root)
			self.log.info('Parsed publisher:%s'%mi.publisher)
		except:
			self.log.exception('Error parsing publisher for url: %r'%self.url)
			
		try:
			mi.pubdate = self.parse_pubdate(root)
			self.log.info('Parsed pubdate:%s'%mi.pubdate)
		except:
			self.log.exception('Error parsing pubdate for url: %r'%self.url)

			
		try:
			mi.rating = self.parse_rating(root)
			self.log.info('Parsed rating:%s'%mi.rating)
		except:
			self.log.exception('Error parsing rating for url: %r'%self.url)

		mi.source_relevance = self.relevance

#		if series:
#			mi.series = series
		
		try:
			isbn = self.parse_isbn(root)
			if isbn:
				self.isbn = mi.isbn = isbn
		except:
			self.log.exception('Error parsing ISBN for url: %r'%self.url)

		if self.databazeknih_id and isbn:
			self.plugin.cache_isbn_to_identifier(isbn, self.databazeknih_id)
			
#		self.plugin.clean_downloaded_metadata(mi)
#		mi.isbn = check_isbn(mi.isbn)
		self.log.info(mi)
		self.result_queue.put(mi)
Example #9
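calibre's internal _get_metadata from the database cache layer: it assembles a complete Metadata object for a book_id, including author sort/link maps, format metadata, custom columns and user categories.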
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []
        aus = {}
        aul = {}
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title = self._field_for('title',
                                   book_id,
                                   default_value=_('Unknown'))
        mi.authors = aum
        mi.author_sort = self._field_for('author_sort',
                                         book_id,
                                         default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments = self._field_for('comments', book_id)
        mi.publisher = self._field_for('publisher', book_id)
        n = nowf()
        mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
        mi.title_sort = self._field_for('sort',
                                        book_id,
                                        default_value=_('Unknown'))
        mi.book_size = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice',
                                          book_id,
                                          default_value='')
        mi.last_modified = self._field_for('last_modified',
                                           book_id,
                                           default_value=n)
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        mi.has_cover = _('Yes') if self._field_for(
            'cover', book_id, default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index',
                                              book_id,
                                              default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(
            self._field_for('identifiers', book_id, default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key + '_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name, cat, ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name, cat])
                    elif name == v:
                        res.append([name, cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Example #11
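An Edelweiss parser that scrapes the product page directly with CSS selectors (css_selectors.Select) instead of working from pre-extracted data, picking the longest valid ISBN and upgrading the cover URL to the flyout size.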
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date, UNDEFINED_DATE
        from css_selectors import Select
        root = parse_html(raw)
        selector = Select(root)
        sku = next(selector('div.sku.attGroup'))
        info = sku.getparent()
        top = info.getparent().getparent()
        banner = top.find('div')
        spans = banner.findall('span')
        title = ''
        for i, span in enumerate(spans):
            if i == 0 or '12pt' in span.get('style', ''):
                title += astext(span)
            else:
                break
        authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
        mi = Metadata(title.strip(), authors)

        # Identifiers
        isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
        for isbn in isbns:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        isbns = sorted(isbns, key=lambda x:len(x) if x else 0, reverse=True)
        if isbns and isbns[0]:
            mi.isbn = isbns[0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        bisac = tuple(selector('div.bisac.attGroup'))
        if bisac:
            bisac = astext(bisac[0])
            mi.tags = [x.strip() for x in bisac.split(',')]
            mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

        # Publisher
        pub = tuple(selector('div.supplier.attGroup'))
        if pub:
            pub = astext(pub[0])
            mi.publisher = pub

        # Pubdate
        pub = tuple(selector('div.shipDate.attGroupItem'))
        if pub:
            pub = astext(pub[0])
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
                if ', Ship Date:' in pub:
                    pub = pub.partition(', Ship Date:')[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE.year:
                    mi.pubdate = q
            except:
                self.log.exception('Error parsing published date: %r'%pub)

        # Comments
        comm = ''
        general = tuple(selector('div#pd-general-overview-content'))
        if general:
            q = self.render_comments(general[0])
            if q != '<p>No title summary available. </p>':
                comm += q
        general = tuple(selector('div#pd-general-contributor-content'))
        if general:
            comm += self.render_comments(general[0])
        general = tuple(selector('div#pd-general-quotes-content'))
        if general:
            comm += self.render_comments(general[0])
        if comm:
            mi.comments = comm

        # Cover
        img = tuple(selector('img.title-image[src]'))
        if img:
            href = img[0].get('src').replace('jacket_covers/medium/',
                                             'jacket_covers/flyout/')
            self.plugin.cache_identifier_to_cover_url(self.sku, href)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

        return mi
Example #12
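parse_book_page builds the record field by field, honouring per-field plugin preferences via self.enabled(); values with no native Metadata slot (extra series, translators, original title, categories, genres) are collected in additional_meta and appended to the comments.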
    def parse_book_page(self, url):
        # TODO: Support for login-based rating fetching
        # TODO: Move all parsing logic to methods in order to avoid dangling variables
        # TODO: Saving metadata in custom columns
        # TODO: Configurable embedding metadata in comment
        # TODO: missing items
        # original language, first polish publish date, publisher serie, form

        self.log.info('INFO: Downloading book page: {}'.format(url))
        root_tag = self.get_lxml_root(url)

        if not root_tag:
            return None

        book_tag = self.get_book_tag(root_tag)

        if self.prefs['title']:
            book_title = self.parse_title(root_tag, book_tag, url)
        else:
            book_title = self.title

        if self.prefs['authors']:
            book_authors = self.parse_authors(root_tag, book_tag, url)
        else:
            book_authors = self.authors

        mi = Metadata(book_title, book_authors)
        additional_meta = {}

        if self.enabled('languages'):
            languages = self.parse_languages(root_tag, book_tag, url)
            if languages:
                mi.languages = languages

        if self.enabled('rating'):
            rating = self.parse_rating(root_tag, book_tag, url)
            if rating is not None:
                mi.rating = rating

        if self.enabled('tags'):
            tags = self.parse_tags(root_tag, book_tag, url)
            if tags:
                mi.tags = tags

        if self.enabled('identifier'):
            identifier = self.parse_identifier(root_tag, book_tag, url)
            if identifier:
                mi.set_identifier(IDENTIFIER, identifier)

        if self.enabled('pubdate'):
            pubdate = self.parse_pubdate(root_tag, book_tag, url)
            if pubdate:
                mi.pubdate = pubdate

        if self.enabled('covers'):
            covers = self.parse_covers(root_tag, book_tag, url)
            if covers:
                mi.has_cover = True
                self.plugin.cached_identifier_to_cover_url('urls').extend(
                    covers)
            else:
                self.plugin.cache_identifier_to_cover_url('nocover', True)
                # TODO: is this necessary?

        if self.enabled('series'):
            series = self.parse_series(root_tag, book_tag, url)
            if series:
                additional_meta['series'] = [
                    self.get_series_string(name, index)
                    for name, index in series
                ]
                name, index = series[0]
                mi.series = name
                if index is not None:
                    mi.series_index = index

        if self.enabled('translators'):
            translators = self.parse_translators(root_tag, book_tag, url)
            if translators:
                additional_meta['translators'] = translators

        if self.enabled('original_title'):
            original_title = self.parse_original_title(root_tag, book_tag, url)
            if original_title:
                additional_meta['original_title'] = original_title

        if self.enabled('categories'):
            categories = self.parse_categories(root_tag, book_tag, url)
            if categories:
                additional_meta['categories'] = categories

        if self.enabled('genres'):
            genres = self.parse_genres(root_tag, book_tag, url)
            if genres:
                additional_meta['genres'] = genres

        if self.enabled('comments'):
            comments = self.parse_comments(root_tag, book_tag, url) or ''
            additional_comments = self.format_additional_comment(
                additional_meta)

            if comments or additional_comments:
                mi.comments = comments + additional_comments

        self.log.info('INFO: Parsing book page completed')

        return mi
Example #13
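A parse_details for legie.cz that emits one Metadata record per edition (year, cover, publisher, ISBN) when an editions list is available, and falls back to a single record otherwise.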
    def parse_details(self, root):
        try:
            legie_id = self.parse_legie_id(self.url)
        except:
            self.log.exception('Error parsing Legie id for url: %r' % self.url)
            legie_id = None

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not legie_id:
            self.log.error('Could not find title/authors/Legie id for %r' %
                           self.url)
            self.log.error('Legie: %r Title: %r Authors: %r' %
                           (legie_id, title, authors))
            return

        self.legie_id = legie_id

        rating = comments = series = series_index = publisher = isbn = year = None
        try:
            rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            (series, series_index) = self.parse_series(root)
        except:
            self.log.info('Series not found.')

        try:
            tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)
            tags = None

        if legie_id:
            editions = self.get_editions()

            if editions:
                num_editions = len(editions)
                self.log.info('Nalezeno %d vydani' % num_editions)
                for edition in editions:
                    (year, cover_url, publisher, isbn) = edition
                    mi = Metadata(title, authors)
                    self.legie_id = "%s#%s" % (legie_id, year)
                    mi.set_identifier('legie', self.legie_id)
                    mi.source_relevance = self.relevance
                    mi.rating = rating
                    mi.comments = comments
                    mi.series = series
                    mi.series_index = series_index
                    if cover_url:
                        mi.cover_url = self.cover_url = cover_url
                        self.plugin.cache_identifier_to_cover_url(
                            self.legie_id, self.cover_url)
                    if tags:
                        mi.tags = tags
                    mi.has_cover = bool(self.cover_url)
                    mi.publisher = publisher
                    mi.isbn = isbn
                    mi.pubdate = self.prepare_date(int(year))
                    mi.language = "ces"
                    self.result_queue.put(mi)
            else:
                mi = Metadata(title, authors)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                try:
                    self.cover_url = self.parse_cover(root)
                except:
                    self.log.exception('Error parsing cover for url: %r' %
                                       self.url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                mi.publisher = publisher
                mi.isbn = isbn
                if year:
                    mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
                if self.legie_id:
                    if self.cover_url:
                        self.plugin.cache_identifier_to_cover_url(
                            self.legie_id, self.cover_url)
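Example #14

A Ridibooks load_details: metadata is read partly from the og:/books: <meta> tags and partly from the embedded schema.org JSON, the series is guessed from a volume-number pattern in the title, and the ISBN and cover URL are cached against the ridibooks identifier.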
    def load_details(self, url, timeout):
        def _format_item(str):
            return re.sub('^"(.*)"$', '\\1', unescape(str))

        def _format_list(str):
            return [_.strip() for _ in _format_item(str).split(',')]

        def _find_meta(node, property):
            return [_.get('content') for _ in node if _.get('property') == property][0]

        def _format_date(date_text):
            year = int(date_text[0:4])
            month = int(date_text[4:6]) 
            day = int(date_text[6:])
            return datetime.datetime(year, month, day, tzinfo=utc_tz)

        try:
            response = self.browser.open(url, timeout=timeout)
            root = lxml.html.fromstring(response.read())

            # Fields taken from the <meta> tags:
            # book ID, title, ISBN, cover image URL, rating
            meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')

            # Fields taken from the schema.org JSON:
            # title, authors, description, publisher
            ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
            ld = [json.loads(_) for _ in ld_json]
            book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
        except Exception as e:
            self.log.exception(e)
            return

        ridibooks_id = re.search('id=([0-9]+)', url).group(1)
        isbn = _find_meta(meta, 'books:isbn')
        cover_url = _find_meta(meta, 'og:image')

        title = _find_meta(meta, 'og:title')
        authors = _format_list(book_info['author']['name'])
        if 'translator' in book_info:
            authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

        mi = Metadata(title, authors)
        mi.set_identifier('ridibooks', ridibooks_id)

        mi.cover_url = cover_url
        mi.has_cover = bool(cover_url)

        mi.publisher = _format_item(book_info['publisher']['name'])
        mi.pubdate = _format_date(book_info['datePublished'])

        mi.comments = _format_item(book_info['description'])
        mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

        series = re.search(u'(.*)\s*(\d+)권', title)
        if series:
            mi.series = series.group(1)
            mi.series_index = float(series.group(2))

        mi.language = 'Korean'
        mi.source_relevance = self.relevance

        if ridibooks_id:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
            if cover_url:
                self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example #15
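A parse_details worker for 17k.com that registers the record under a 17k identifier, dumps the raw HTML to a temporary file in testing mode, and fixes the language to Chinese.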
    def parse_details(self, raw, root):
        # Parse the individual metadata fields
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r' % self.url)
            asin = None
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(
                    prefix=(asin or str(uuid.uuid4())) + '_',
                    suffix='.html',
                    delete=False) as f:
                f.write(raw)
            print('Downloaded html for', asin, 'saved in', f.name)
        # Parse the title
        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None
        # Parse the authors
        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r' %
                           self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' %
                           (asin, title, authors))
            return
        # Use the title and authors to build the Metadata object mi
        mi = Metadata(title, authors)
        # Set the book id
        idtype = '17k'
        mi.set_identifier(idtype, asin)
        self.k17k_id = asin

        # Set the comments (book description)
        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
        # Set the series
        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
        # Set the tags
        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        # Set the last-modified date
#        try:
#            mi.last_modified = self.parse_last_modified(root)
#        except:
#            self.log.exception('Error parsing last_modified for url: %r'%self.url)
        # Set the cover
        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        mi.has_cover = bool(self.cover_url)
        mi.source_relevance = self.relevance
        mi.languages = [
            u'中文',
        ]

        if self.k17k_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.k17k_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example #16
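A parse_details for Aladin that keys the record on the ISBN extracted from the URL, sets the language to Korean, and keeps cover_url on the Metadata object so tests can inspect it.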
    def parse_details(self, root):
        try:
            isbn = self.extract_isbn(self.url)
        except:
            self.log.exception('No ISBN in URL: %r'%self.url)
            isbn = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not isbn:
            self.log.error('Could not find title/authors/Aladin id for %r'%self.url)
            self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        #mi.set_identifier('isbn', isbn)
        mi.isbn = isbn
        self.isbn = isbn

        # ISBN-13
        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!
        if mi.has_cover:
            self.log.info('Cover URL: '+mi.cover_url)

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example #18
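A parse_details for Antikvarium.hu in the same pattern, with verbose logging of every successfully parsed field.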
    def parse_details(self, root):
        try:
            antik_id = self.parse_antik_id(root)
            self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
        except:
            self.log.exception('Error parsing Antikvarium id for url: %r' %
                               self.url)
            antik_id = None

        try:
            title = self.parse_title(root)
            self.log.info('Parsed title: %s' % title)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
            self.log.info('Parsed authors: %s' % authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not antik_id:
            self.log.error(
                'Could not find title/authors/Antikvarium.hu id for %r' %
                self.url)
            self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' %
                           (antik_id, title, authors))
            return

        mi = Metadata(title, authors)
        mi.set_identifier('antik_hu', antik_id)
        self.antik_id = antik_id

        try:
            isbn = self.parse_isbn(root)
            self.log.info('Parsed ISBN: %s' % isbn)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            series = self.parse_series(root)
            self.log.info('Parsed series: %s' % series)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            series = None

        try:
            mi.series_index = self.parse_series_index(root)
            self.log.info('Parsed series index: %s' % mi.series_index)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            mi.series_index = None

        try:
            mi.comments = self.parse_comments(root)
            self.log.info('Parsed comments: %s' % mi.comments)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root)
            self.log.info('Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.antik_id,
                                                      self.cover_url)
            mi.has_cover = bool(self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        try:
            mi.publisher = self.parse_publisher(root)
            self.log.info('Parsed publisher: %s' % mi.publisher)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        try:
            mi.tags = self.parse_tags(root)
            self.log.info('Parsed tags: %s' % mi.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
            self.log.info('Parsed publication date: %s' % mi.pubdate)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        try:
            mi.languages = self.parse_languages(root)
            self.log.info('Parsed languages: %r' % mi.languages)
        except:
            self.log.exception('Error parsing languages for url: %r' %
                               self.url)

        mi.source_relevance = self.relevance

        if series:
            mi.series = series

        if self.antik_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
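Example #19

A parse_details for Kyobobook following the same per-field try/except pattern; it caches the ISBN and cover URL against the kyobobook identifier and sets the language when one can be parsed.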
    def parse_details(self, root):
        try:
            kyobobook_id = self.parse_kyobobook_id(self.url)
        except:
            self.log.exception('Error parsing Kyobobook id for url: %r' %
                               self.url)
            kyobobook_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r' %
                               self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not kyobobook_id:
            self.log.error('Could not find title/authors/kyobobook id for %r' %
                           self.url)
            self.log.error('Kyobobook: %r Title: %r Authors: %r' %
                           (kyobobook_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('kyobobook', kyobobook_id)
        self.kyobobook_id = kyobobook_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception('Error parsing publisher and date for url: %r' %
                               self.url)

        try:
            lang = self._parse_language(root)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        mi.source_relevance = self.relevance

        if self.kyobobook_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn,
                                                     self.kyobobook_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.kyobobook_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example #20
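The Goodreads variant of the same parse_details pattern.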
    def parse_details(self, root):
        try:
            goodreads_id = self.parse_goodreads_id(self.url)
        except:
            self.log.exception("Error parsing goodreads id for url: %r" % self.url)
            goodreads_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception("Error parsing title and series for url: %r" % self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception("Error parsing authors for url: %r" % self.url)
            authors = []

        if not title or not authors or not goodreads_id:
            self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
            self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier("goodreads", goodreads_id)
        self.goodreads_id = goodreads_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception("Error parsing ISBN for url: %r" % self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception("Error parsing ratings for url: %r" % self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception("Error parsing comments for url: %r" % self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception("Error parsing cover for url: %r" % self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception("Error parsing tags for url: %r" % self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception("Error parsing publisher and date for url: %r" % self.url)

        mi.source_relevance = self.relevance

        if self.goodreads_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example #21
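A JD.com parser: it decodes the gb18030 page, reads ISBN, publisher, pubdate and series from the product-information list, and extracts the description from a JSON-wrapped showdesc payload.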
    def parse(self, raw, desc_raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_date, utcnow
        import json

        root = parse_html(raw.decode('gb18030'))
        title = root.xpath('//*[@id="name"]/div[1]/text()')
        title = title[0].strip()
        authors = []
        for i in root.xpath('//*[@id="p-author"]/a'):
            authors.append(i.text.strip())
        mi = Metadata(title, authors)

        information = root.xpath('//*[@id="parameter2"]/li')
        info = dict()
        for i in information:
            tmp = etree.tostring(i, method='text',
                                 encoding='utf-8').split(u':')
            info[tmp[0].strip()] = tmp[1].strip()
        # Identifiers
        mi.identifiers = self.plugin.identifiers
        mi.identifiers['jd'] = self.sku
        isbn = info['ISBN']
        self.log.error(isbn)
        if isbn:
            mi.isbn = isbn
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
            mi.identifiers['isbn'] = isbn

        # Publisher
        mi.publisher = info.get(u'出版社')

        # Pubdate
        pubdate = info.get(u'出版时间')
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except:
                self.log.error('Failed to parse pubdate %r' % pubdate)

        # Series
        mi.series = info.get(u'丛书名')

        img = root.xpath('//*[@id="spec-n1"]/img')
        cover = img[0].get('src')
        if cover:
            if not cover.startswith('http'):
                cover = 'https:' + cover
            self.plugin.cache_identifier_to_cover_url(self.sku, cover)
        self.log.error(cover)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None

        # Comments
        # showdesc({"date":1583588455348,"content":" ... "})
        try:
            desc = json.loads(desc_raw[9:-1].decode('gb18030'))
            desc_root = parse_html(desc['content'])
            div = desc_root.xpath(
                '//*[@id="detail-tag-id-3"]/div[2]/div/text()')

            comments = div[0]
            mi.comments = comments
        finally:
            return mi
Example #22
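A Dangdang parse_details modelled on the Amazon plugin, complete with CAPTCHA detection and a testing-mode HTML dump.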
    def parse_details(self, raw, root):
        dang_id = parse_dang_id(root, self.log, self.url)
        if not dang_id and root.xpath(
                '//form[@action="/errors/validateCaptcha"]'):
            raise CaptchaError(
                'Dangdang returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.'
            )
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(
                    prefix=(dang_id or str(uuid.uuid4())) + '_',
                    suffix='.html',
                    delete=False) as f:
                f.write(raw)
            print('Downloaded html for', dang_id, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not dang_id:
            self.log.error('Could not find title/authors/dang_id for %r' %
                           self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' %
                           (dang_id, title, authors))
            return

        mi = Metadata(title, authors)
        idtype = 'dang'
        mi.set_identifier(idtype, dang_id)
        self.dang_id = dang_id

        try:
            mi.comments = self.parse_comments(root, raw)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_url)

        pd_info = root.xpath(self.pd_info_xpath)
        pd_info_store = root.xpath(self.pd_info_store_xpath)
        pd_desc = root.xpath(self.pd_desc_xpath)

        if pd_info or pd_info_store:
            try:
                isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)

            if pd_info:
                pd_info = pd_info[0]
            else:
                pd_info = pd_info_store[0]

            try:
                mi.publisher = self.parse_publisher(pd_info)
            except:
                self.log.exception('Error parsing publisher for url: %r' %
                                   self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd_info)
            except:
                self.log.exception('Error parsing publish date for url: %r' %
                                   self.url)

        else:
            self.log.warning('Failed to find product description for url: %r' %
                             self.url)

        mi.source_relevance = self.relevance

        if self.dang_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.dang_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example #23
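The Amazon parse_details that the previous example derives from; it handles both the newer nonHeroSection markup and the older product-description block.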
    def parse_details(self, raw, root):
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r'%self.url)
            asin = None
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                    suffix='.html', delete=False) as f:
                f.write(raw)
            print ('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r'%self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r'%self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
                authors))
            return

        mi = Metadata(title, authors)
        idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
        mi.set_identifier(idtype, asin)
        self.amazon_id = asin

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r'%self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
        if non_hero:
            # New style markup
            try:
                self.parse_new_details(root, mi, non_hero[0])
            except:
                self.log.exception('Failed to parse new-style book details section')
        else:
            pd = root.xpath(self.pd_xpath)
            if pd:
                pd = pd[0]

                try:
                    isbn = self.parse_isbn(pd)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except:
                    self.log.exception('Error parsing ISBN for url: %r'%self.url)

                try:
                    mi.publisher = self.parse_publisher(pd)
                except:
                    self.log.exception('Error parsing publisher for url: %r'%self.url)

                try:
                    mi.pubdate = self.parse_pubdate(pd)
                except:
                    self.log.exception('Error parsing publish date for url: %r'%self.url)

                try:
                    lang = self.parse_language(pd)
                    if lang:
                        mi.language = lang
                except:
                    self.log.exception('Error parsing language for url: %r'%self.url)

            else:
                self.log.warning('Failed to find product description for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.amazon_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
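
The idtype line above encodes the storefront in the identifier name: amazon.com ASINs are stored as 'amazon', every other domain gets a suffixed type so they do not collide. Extracted as a standalone helper for clarity (the helper name is mine):

def amazon_idtype(domain):
    return 'amazon' if domain == 'com' else 'amazon_' + domain

assert amazon_idtype('com') == 'amazon'
assert amazon_idtype('de') == 'amazon_de'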
Beispiel #24
0
    def load_details(self, url, timeout):
        def _format_item(text):
            return re.sub('^"(.*)"$', '\\1', unescape(text))

        def _format_list(text):
            return [_.strip() for _ in _format_item(text).split(',')]

        def _find_meta(nodes, prop):
            return [
                _.get('content') for _ in nodes if _.get('property') == prop
            ][0]

        def _format_date(date_text):
            year = int(date_text[0:4])
            month = int(date_text[4:6])
            day = int(date_text[6:])
            return datetime.datetime(year, month, day, tzinfo=utc_tz)

        try:
            response = self.browser.open(url, timeout=timeout)
            root = lxml.html.fromstring(response.read())

            # <meta> tag에서 불러오는 항목
            # 책ID, 제목, ISBN, 이미지URL, 평점
            meta = root.xpath(
                '//meta[starts-with(@property, "og") or starts-with(@property, "books")]'
            )

            # schema.org JSON에서 불러오는 항목
            # 제목, 저자, 책소개, 출판사
            ld_json = root.xpath(
                '//script[@type="application/ld+json"]/text()')
            ld = [json.loads(_) for _ in ld_json]
            book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
        except Exception as e:
            self.log.exception(e)
            return

        ridibooks_id = re.search('id=([0-9]+)', url).group(1)
        isbn = _find_meta(meta, 'books:isbn')
        cover_url = _find_meta(meta, 'og:image')

        title = _find_meta(meta, 'og:title')
        authors = _format_list(book_info['author']['name'])
        if 'translator' in book_info:
            authors.extend([
                _ + u'(역자)'
                for _ in _format_list(book_info['translator']['name'])
            ])

        mi = Metadata(title, authors)
        mi.set_identifier('ridibooks', ridibooks_id)

        mi.cover_url = cover_url
        mi.has_cover = bool(cover_url)

        mi.publisher = _format_item(book_info['publisher']['name'])
        mi.pubdate = _format_date(book_info['datePublished'])

        mi.comments = _format_item(book_info['description'])
        mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

        series = re.search(u'(.*)\\s*(\\d+)권', title)
        if series:
            mi.series = series.group(1)
            mi.series_index = float(series.group(2))

        mi.language = 'Korean'
        mi.source_relevance = self.relevance

        if ridibooks_id:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
            if cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    ridibooks_id, cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
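
load_details() above pulls its structured fields out of the page's JSON-LD blocks. A self-contained sketch of that lookup, assuming only that the page embeds a schema.org Book object:

import json

import lxml.html

def first_book_ld(html_text):
    # Return the first schema.org object with @type == 'Book', or None.
    root = lxml.html.fromstring(html_text)
    for blob in root.xpath('//script[@type="application/ld+json"]/text()'):
        try:
            data = json.loads(blob)
        except ValueError:
            continue
        if isinstance(data, dict) and data.get('@type') == 'Book':
            return data
    return None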
Beispiel #25
0
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date, UNDEFINED_DATE

        root = parse_html(raw)
        sku = CSSSelect("div.sku.attGroup")(root)[0]
        info = sku.getparent()
        top = info.getparent().getparent()
        banner = top.find("div")
        spans = banner.findall("span")
        title = ""
        for i, span in enumerate(spans):
            if i == 0 or "12pt" in span.get("style", ""):
                title += astext(span)
            else:
                break
        authors = [re.sub(r"\(.*\)", "", x).strip() for x in astext(spans[-1]).split(",")]
        mi = Metadata(title.strip(), authors)

        # Identifiers
        isbns = [check_isbn(x.strip()) for x in astext(sku).split(",")]
        for isbn in isbns:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
        if isbns and isbns[0]:
            mi.isbn = isbns[0]
        mi.set_identifier("edelweiss", self.sku)

        # Tags
        bisac = CSSSelect("div.bisac.attGroup")(root)
        if bisac:
            bisac = astext(bisac[0])
            mi.tags = [x.strip() for x in bisac.split(",")]
            mi.tags = [t[1:].strip() if t.startswith("&") else t for t in mi.tags]

        # Publisher
        pub = CSSSelect("div.supplier.attGroup")(root)
        if pub:
            pub = astext(pub[0])
            mi.publisher = pub

        # Pubdate
        pub = CSSSelect("div.shipDate.attGroupItem")(root)
        if pub:
            pub = astext(pub[0])
            parts = pub.partition(":")[0::2]
            pub = parts[1] or parts[0]
            try:
                if ", Ship Date:" in pub:
                    pub = pub.partition(", Ship Date:")[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE.year:
                    mi.pubdate = q
            except:
                self.log.exception("Error parsing published date: %r" % pub)

        # Comments
        comm = ""
        general = CSSSelect("div#pd-general-overview-content")(root)
        if general:
            q = self.render_comments(general[0])
            if q != "<p>No title summary available. </p>":
                comm += q
        general = CSSSelect("div#pd-general-contributor-content")(root)
        if general:
            comm += self.render_comments(general[0])
        general = CSSSelect("div#pd-general-quotes-content")(root)
        if general:
            comm += self.render_comments(general[0])
        if comm:
            mi.comments = comm

        # Cover
        img = CSSSelect("img.title-image[src]")(root)
        if img:
            href = img[0].get("src").replace("jacket_covers/medium/", "jacket_covers/flyout/")
            self.plugin.cache_identifier_to_cover_url(self.sku, href)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

        return mi
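
The ISBN handling above sorts the candidates longest-first so that an ISBN-13 beats its ISBN-10 twin for the same title. The same logic as a standalone helper (hypothetical name):

def preferred_isbn(candidates):
    valid = [isbn for isbn in candidates if isbn]
    valid.sort(key=len, reverse=True)
    return valid[0] if valid else None

assert preferred_isbn([None, '0306406152', '9780306406157']) == '9780306406157'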
Beispiel #26
0
    def parse_details(self, root):
        try:
            CBDB_id = self.parse_CBDB_id(self.url)
        except:
            self.log.exception('Error parsing CBDB id for url: %r' % self.url)
            CBDB_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r' %
                               self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not CBDB_id:
            self.log.error('Could not find title/authors/CBDB id for %r' %
                           self.url)
            self.log.error('CBDB: %r Title: %r Authors: %r' %
                           (CBDB_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        #mi.identifiers['cbdb'] = CBDB_id
        mi.set_identifier('cbdb', CBDB_id)
        #self.log.info(CBDB_id)
        #self.log.info(mi.identifiers.get('cbdb', None))
        self.CBDB_id = CBDB_id

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        # summary
        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_urls = self.parse_covers(root)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_urls)
        #self.log.info('covers')
        #self.log.info(self.cover_urls)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing publisher and date for url: %r' %
                               self.url)

        mi.source_relevance = self.relevance

        mi.language = 'Czech'

        #self.log.info('self.CBDB_id = ' + str(self.CBDB_id ))

        if self.CBDB_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)

            if self.cover_urls:
                self.plugin.cache_identifier_to_cover_url(
                    self.CBDB_id, self.cover_urls)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #28
0
	def parse_details(self, root):
		isfdb_id = None
		title = None
		authors = []
		isbn = None
		publisher = None
		pubdate = None
		
		try:
			isfdb_id = re.search(r'(\d+)$', self.url).group(1)
		except:
			self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)

		detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
		if not detail_nodes:
			detail_nodes = root.xpath('//div[@id="content"]/div/ul/li') # no table (on records with no image)

		for detail_node in detail_nodes:
			section = detail_node[0].text_content().strip().rstrip(':')
			#self.log.info(section)
			try:
				if section == 'Publication':
					title = detail_node[0].tail.strip()
					if not title:
						# assume an extra span with a transliterated title tooltip
						title = detail_node[1].text_content().strip()
					#self.log.info(title)
				elif section == 'Authors' or section == 'Editors':
					for a in detail_node.xpath('.//a'):
						author = a.text_content().strip()
						if section.startswith('Editors'):
							authors.append(author + ' (Editor)')
						else:
							authors.append(author)
					#self.log.info(authors)
				elif section == 'ISBN':
					isbn = detail_node[0].tail.strip('[] \n')
					#self.log.info(isbn)
				elif section == 'Publisher':
					publisher = detail_node.xpath('a')[0].text_content().strip()
					#self.log.info(publisher)
				elif section == 'Date':
					pubdate = self._convert_date_text(detail_node[0].tail.strip())
					#self.log.info(pubdate)
			except:
				self.log.exception('Error parsing section %r for url: %r' % (section, self.url) )

		if not title or not authors or not isfdb_id:
			self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
			self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title,
				authors))
			return

		mi = Metadata(title, authors)
		mi.set_identifier('isfdb', isfdb_id)
		self.isfdb_id = isfdb_id

		if isbn:
			self.isbn = mi.isbn = isbn
		if publisher:
			mi.publisher = publisher
		if pubdate:
			mi.pubdate = pubdate
			
		try:
			mi.comments = self.parse_comments(root)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_cover(root)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)
		
		mi.has_cover = bool(self.cover_url)
		mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

		mi.source_relevance = self.relevance

		if self.isfdb_id:
			if self.isbn:
				self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)

		self.plugin.clean_downloaded_metadata(mi)
		self.result_queue.put(mi)
Beispiel #29
0
    def parse_details(self, root):
        try:
            kyobobook_id = self.parse_kyobobook_id(self.url)
        except:
            self.log.exception('Error parsing Kyobobook id for url: %r'%self.url)
            kyobobook_id = None
        
        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not kyobobook_id:
            self.log.error('Could not find title/authors/kyobobook id for %r'%self.url)
            self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('kyobobook', kyobobook_id)
        self.kyobobook_id = kyobobook_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception('Error parsing publisher and date for url: %r'%self.url)

        try:
            lang = self._parse_language(root)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.kyobobook_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #30
0
    def parse_details(self, root):
        try:
            yes24_id = self.parse_yes24_id(self.url)
        except:
            self.log.exception('Error parsing YES24 id for url: %r'%self.url)
            yes24_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not yes24_id:
            self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
            self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('yes24', yes24_id)
        self.yes24_id = yes24_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        if self.yes24_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
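
parse_yes24_id() is likewise referenced but not shown. A hypothetical sketch that extracts a trailing numeric id from the product URL (the URL shape is an assumption):

import re

def parse_yes24_id(url):
    match = re.search(r'(\d+)\s*$', url)
    return match.group(1) if match else None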
Beispiel #31
0
	def parse_details(self, root):
		try:
			moly_id = self.parse_moly_id(self.url)
			self.log.info('Parsed moly.hu identifier: %s'%moly_id)
		except:
			self.log.exception('Error parsing moly.hu id for url: %r'%self.url)
			moly_id = None

		try:
			title = self.parse_title(root)
			self.log.info('Parsed title: %s'%title)
		except:
			self.log.exception('Error parsing title for url: %r'%self.url)
			title = None
		
		try:
			authors = self.parse_authors(root)
			self.log.info('Parsed authors: %s'%authors)
		except:
			self.log.exception('Error parsing authors for url: %r'%self.url)
			authors = []

		if not title or not authors or not moly_id:
			self.log.error('Could not find title/authors/moly.hu id for %r'%self.url)
			self.log.error('Moly.hu id: %r Title: %r Authors: %r'%(moly_id, title, authors))
			return

		mi = Metadata(title, authors)
		mi.set_identifier('moly_hu', moly_id)
		self.moly_id = moly_id

		try:
			isbn = self.parse_isbn(root)
			self.log.info('Parsed ISBN: %s'%isbn)
			if isbn:
				self.isbn = mi.isbn = isbn
		except:
			self.log.exception('Error parsing ISBN for url: %r'%self.url)
		
		try:
			series_info = self.parse_series(root)
			if series_info is not None:
				mi.series = series_info[0]
				mi.series_index = int(series_info[1])
				self.log.info('Parsed series: %s, series index: %f'%(mi.series,mi.series_index))
		except:
			self.log.exception('Error parsing series for url: %r'%self.url)
			
		try:
			mi.comments = self.parse_comments(root)
			self.log.info('Parsed comments: %s'%mi.comments)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_covers(root)
			self.log.info('Parsed URL for cover: %r'%self.cover_url)
			self.plugin.cache_identifier_to_cover_url(self.moly_id, self.cover_url)
			mi.has_cover = bool(self.cover_url)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)

		try:
			mi.tags = self.parse_tags(root)
			self.log.info('Parsed tags: %s'%mi.tags)
		except:
			self.log.exception('Error parsing tags for url: %r'%self.url)
			
		try:
			mi.languages = self.parse_languages(mi.tags)
			self.log.info('Parsed languages: %r'%mi.languages)
		except:
			self.log.exception('Error parsing language for url: %r'%self.url)
			
		try:
			mi.publisher = self.parse_publisher(root)
			self.log.info('Parsed publisher: %s'%mi.publisher)
		except:
			self.log.exception('Error parsing publisher for url: %r'%self.url)	
			
		try:
			mi.pubdate = self.parse_published_date(root)
			self.log.info('Parsed publication date: %s'%mi.pubdate)
		except:
			self.log.exception('Error parsing published date for url: %r'%self.url)
			
		try:
			mi.rating = self.parse_rating(root)
			self.log.info('Parsed rating: %s\n\n'%mi.rating)
		except:
			self.log.exception('Error parsing rating for url: %r\n\n'%self.url)


		mi.source_relevance = self.relevance

		if self.moly_id and self.isbn:
			self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id)

		self.plugin.clean_downloaded_metadata(mi)

		self.result_queue.put(mi)
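
parse_languages(mi.tags) above derives the language list from the scraped tags. A hypothetical sketch assuming Hungarian-style tag names (the tag strings and mapping are assumptions, not taken from the plugin):

def parse_languages(tags):
    lang_map = {u'magyar nyelvű': 'hu', u'angol nyelvű': 'en'}
    return [code for tag, code in lang_map.items() if tag in (tags or [])]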
Beispiel #32
0
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date, UNDEFINED_DATE
        from css_selectors import Select
        root = parse_html(raw)
        selector = Select(root)
        sku = next(selector('div.sku.attGroup'))
        info = sku.getparent()
        top = info.getparent().getparent()
        banner = top.find('div')
        spans = banner.findall('span')
        title = ''
        for i, span in enumerate(spans):
            if i == 0 or '12pt' in span.get('style', ''):
                title += astext(span)
            else:
                break
        authors = [
            re.sub(r'\(.*\)', '', x).strip()
            for x in astext(spans[-1]).split(',')
        ]
        mi = Metadata(title.strip(), authors)

        # Identifiers
        isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
        for isbn in isbns:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
        if isbns and isbns[0]:
            mi.isbn = isbns[0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        bisac = tuple(selector('div.bisac.attGroup'))
        if bisac:
            bisac = astext(bisac[0])
            mi.tags = [x.strip() for x in bisac.split(',')]
            mi.tags = [
                t[1:].strip() if t.startswith('&') else t for t in mi.tags
            ]

        # Publisher
        pub = tuple(selector('div.supplier.attGroup'))
        if pub:
            pub = astext(pub[0])
            mi.publisher = pub

        # Pubdate
        pub = tuple(selector('div.shipDate.attGroupItem'))
        if pub:
            pub = astext(pub[0])
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
                if ', Ship Date:' in pub:
                    pub = pub.partition(', Ship Date:')[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE.year:
                    mi.pubdate = q
            except:
                self.log.exception('Error parsing published date: %r' % pub)

        # Comments
        comm = ''
        general = tuple(selector('div#pd-general-overview-content'))
        if general:
            q = self.render_comments(general[0])
            if q != '<p>No title summary available. </p>':
                comm += q
        general = tuple(selector('div#pd-general-contributor-content'))
        if general:
            comm += self.render_comments(general[0])
        general = tuple(selector('div#pd-general-quotes-content'))
        if general:
            comm += self.render_comments(general[0])
        if comm:
            mi.comments = comm

        # Cover
        img = tuple(selector('img.title-image[src]'))
        if img:
            href = img[0].get('src').replace('jacket_covers/medium/',
                                             'jacket_covers/flyout/')
            self.plugin.cache_identifier_to_cover_url(self.sku, href)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None

        return mi
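
Note the API difference from the earlier copy of this parser: css_selectors.Select (bundled with calibre) yields matches lazily, hence next(...) for a required first element and tuple(...) to materialize optional ones. A tiny usage sketch (the sample HTML is made up):

import lxml.html
from css_selectors import Select  # bundled with calibre

root = lxml.html.fromstring('<div class="sku attGroup">9780306406157</div>')
selector = Select(root)
sku = next(selector('div.sku.attGroup'))         # first match; StopIteration if absent
missing = tuple(selector('div.bisac.attGroup'))  # empty tuple when nothing matches
assert sku.text == '9780306406157' and missing == ()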
Beispiel #33
0
	def extract_vol_details(self, vol_url):
        # Here we extract and format the information from the chosen volume.
        # - The first and last name to populate author and author sort         : vol_auteur_prenom and vol_auteur_nom
        # - The title of the volume                                            : vol_title
        # - The series name the volume is part of                              : vol_serie
        # - The sequence number in the series                                  : vol_serie_seq                        # missing
        # - The publisher ("éditeur") of this volume                           : vol_editor
        # - The publisher's collection of this volume                          : vol_coll
        # - The collection serial code of this volume                          : vol_coll_srl
        # - The "dépôt légal" date (the publication date is largely unknown)   : vol_dp_lgl                           # date format to be computed
        # - The ISBN number associated with the volume                         : vol_isbn
        # - The volume tags                                                    : vol_genre
        # - The url pointer to the volume cover image                          : vol_cover_index
        # - The comments, which include various info about the book            : vol_comment_soup
        #   . reference, an url pointer to noosfere
        #   . couverture, an url pointer to noosfere; the cover may be really small, but it is accurate to the volume
        #   . first edition information
        #   . series (cycle) name and number
        #   . this volume's publisher info
        #   . Résumé (quatrième de couverture)
        #   . Critiques
        #   . Sommaire detailing which novels are in the volume when it is an anthology
        #   . Critiques about the series and/or about another volume of the book
        #

        debug = self.dbg_lvl & 2
        self.log.info(self.who, "\nIn extract_vol_details(soup)")
        if debug:
            self.log.info(self.who, "vol_url       : ", vol_url)

        if debug:
            self.log.info(
                self.who,
                "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')"
            )
            self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who)
        rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who)
        soup = rsp[0]
        url_vrai = rsp[1].replace("&Tri=3", "")
        #        if debug: self.log.info(self.who,soup.prettify())              # useful but too big...

        self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace(
            '?', '&').replace('=', '&').split('&')[2]
        # the original one-liner failed with "'Worker' object has no attribute
        # 'nfsr_id'" simply because of a typo: nfsr_id instead of nsfr_id
        self.nsfr_id = self.nsfr_id.strip('$')

        if debug:
            self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id,
                          type(self.nsfr_id))

        tmp_lst = []
        vol_info = {}
        vol_title = ""
        vol_auteur = ""
        vol_auteur_prenom = ""
        vol_auteur_nom = ""
        vol_serie = ""
        vol_serie_seq = ""
        vol_editor = ""
        vol_coll = ""
        vol_coll_srl = ""
        vol_dp_lgl = ""
        vol_isbn = ""
        vol_genre = ""
        vol_cover_index = ""
        comment_generic = None
        comment_resume = None
        comment_Critiques = None
        comment_Sommaire = None
        comment_AutresCritique = None
        comment_cover = None
        comment_decoupage_annexe = None

        # add volume address as a reference in the comment
        vol_comment_soup = BS(
            '<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai +
            '</a></p></div>', "lxml")
        if debug: self.log.info(self.who, "vol reference processed")

        if soup.select("span[class='TitreNiourf']"):
            vol_title = soup.select(
                "span[class='TitreNiourf']")[0].text.strip()
        if debug: self.log.info(self.who, "vol_title processed : ", vol_title)

        if soup.select("span[class='AuteurNiourf']"):
            vol_auteur = soup.select(
                "span[class='AuteurNiourf']")[0].text.replace("\n",
                                                              "").strip()
        if debug:
            self.log.info(self.who, "vol_auteur processed : ", vol_auteur)
        for i in range(len(vol_auteur.split())):
            if not vol_auteur.split()[i].isupper():
                vol_auteur_prenom += " " + vol_auteur.split()[i]
            else:
                vol_auteur_nom += " " + vol_auteur.split()[i].title()
        vol_auteur = vol_auteur.title()
        vol_auteur_prenom = vol_auteur_prenom.strip()
        if debug:
            self.log.info(self.who, "vol_auteur_prenom processed : ",
                          vol_auteur_prenom)
        vol_auteur_nom = vol_auteur_nom.strip()
        if debug:
            self.log.info(self.who, "vol_auteur_nom processed : ",
                          vol_auteur_nom)

        if soup.select("a[href*='serie.asp']"):
            if soup.select("a[href*='serie.asp']")[0].find_parent(
                    "span", {"class": "ficheNiourf"}):
                vol_serie = soup.select("a[href*='serie.asp']")[0].text
                tmp_vss = [
                    x for x in soup.select("a[href*='serie.asp']")
                    [0].parent.stripped_strings
                ]
                for i in range(len(tmp_vss)):
                    if "vol." in tmp_vss[i]:
                        if not vol_serie_seq:
                            vol_serie_seq = tmp_vss[i].replace("vol.",
                                                               "").strip()
                    if "découpage" in tmp_vss[i]:
                        dec_anx_url = "https://www.noosfere.org/livres/" + soup.select(
                            "a[href*='serie.asp']")[0]['href']
                        comment_pre_decoupage_annexe = BS(
                            '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>',
                            "lxml")
                        comment_decoupage_annexe = self.get_decoupage_annexe(
                            dec_anx_url)
                if debug:
                    self.log.info(self.who,
                                  "vol_serie, vol_serie_seq processed : ",
                                  vol_serie, ",", vol_serie_seq)

        comment_generic = soup.select("span[class='ficheNiourf']")[0]
        new_div = soup.new_tag('div')
        comment_generic = comment_generic.wrap(new_div)
        if debug: self.log.info(self.who, "comment_generic processed")

        if soup.select("a[href*='editeur.asp']"):
            vol_editor = soup.select("a[href*='editeur.asp']")[0].text
        if debug:
            self.log.info(self.who, "vol_editor processed : ", vol_editor)

        if soup.select("a[href*='collection.asp']"):
            vol_coll = soup.select("a[href*='collection.asp']")[0].text
        if debug: self.log.info(self.who, "vol_coll : ", vol_coll)

        for i in comment_generic.stripped_strings:
            tmp_lst.append(str(i))
        vol_coll_srl = tmp_lst[-1]
        if "n°" in vol_coll_srl:
            for k in ["n°", "(", ")"]:
                if k in vol_coll_srl:
                    vol_coll_srl = vol_coll_srl.replace(k, "")
            vol_coll_srl = vol_coll_srl.strip()
            vol_coll_srl = vol_coll_srl.split("/")[0]
            if vol_coll_srl[0].isnumeric():
                vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:]
        else:
            vol_coll_srl = ""
        if debug:
            self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl)

        # publication date is largely ignored in noosfere, but we have the "dépôt légal" date and I use it instead
        # note that I 'calculate' the missing day of the month and even sometimes the missing month
        ms = ("janvier", "février", "mars", "avril", "mai", "juin", "juillet",
              "août", "septembre", "octobre", "novembre", "décembre")
        for elemnt in soup.select_one(
                "span[class='sousFicheNiourf']").stripped_strings:
            if debug: self.log.info(self.who, "elemnt : ", elemnt)
            if not vol_dp_lgl:
                elemn = (elemnt.replace("Dépôt légal :",
                                        "").split(','))[0].strip()
                if elemn:
                    if elemn.isnumeric() and len(elemn) == 4:
                        vol_dp_lgl = datetime.datetime.strptime(
                            "175 " + elemn, "%j %Y")
                    elif "semestre" in elemn:
                        ele = elemn.split()
                        vol_dp_lgl = datetime.datetime.strptime(
                            ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:]
                            + " " + ele[2], "%j %Y")
                    elif "trimestre" in elemn:
                        ele = elemn.split()
                        vol_dp_lgl = datetime.datetime.strptime(
                            ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:]
                            + " " + ele[2], "%j %Y")
                    else:
                        for i in range(len(ms)):
                            if ms[i] in elemn:
                                ele = elemn.split()
                                vol_dp_lgl = datetime.datetime.strptime(
                                    ("000" + str(10 + 31 * i))[-3:] + " " +
                                    ele[1], "%j %Y")
                                break
                    if debug:
                        self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl)

            if "ISBN" in elemnt:
                vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '')
                if "néant" in vol_isbn: vol_isbn = ""
                if debug:
                    self.log.info(self.who, "vol_isbn processed : ", vol_isbn)

            if "Genre" in elemnt:
                vol_genre = elemnt.replace("Genre :", "").strip()
                if debug:
                    self.log.info(self.who, "vol_genre processed : ",
                                  vol_genre)

        if soup.select("img[name='couverture']"):
            for elemnt in repr(
                    soup.select("img[name='couverture']")[0]).split('"'):
                if "http" in elemnt:
                    if not vol_cover_index:
                        vol_cover_index = elemnt
                        if debug:
                            self.log.info(self.who,
                                          "vol_cover_index processed : ",
                                          vol_cover_index)

        # add cover image address as a reference in the comment
        if vol_cover_index:
            comment_cover = BS(
                '<div><p>Couverture: <a href="' + vol_cover_index + '">' +
                vol_cover_index + '</a></p></div>', "lxml")

    # select the fields I want... More exist, such as film adaptations or pointers to recommended reading,
    # but those are not quite consistent across all the books (noosfere is a common database fed by many people),
    # and besides I have enough info as it is AND I do NOT want to take away noosfere's business

        tmp_comm_lst = soup.select("span[class='AuteurNiourf']")
        if debug: self.log.info(self.who, tmp_comm_lst)  # useful but too long
        for i in range(len(tmp_comm_lst)):
            if "Quatrième de couverture" in str(tmp_comm_lst[i]):
                comment_resume = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]
                if debug: self.log.info(self.who, "comment_resume processed")

            if "Critiques" in str(tmp_comm_lst[i]):
                if not "autres" in str(tmp_comm_lst[i]):
                    comment_Critiques = tmp_comm_lst[i].find_parents(
                        "div", {'class': 'sousbloc'})[0]
                    if debug:
                        self.log.info(self.who, "comment_Critiques processed")

            if "Sommaire" in str(tmp_comm_lst[i]):
                comment_Sommaire = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]
                if debug: self.log.info(self.who, "comment_Sommaire processed")

            if "Critiques des autres" in str(tmp_comm_lst[i]):
                comment_AutresCritique = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]

                if comment_AutresCritique.select('a[href*="serie.asp"]') and (
                        "Critique de la série" in comment_AutresCritique.
                        select('a[href*="serie.asp"]')[0].text):
                    critic_url = "https://www.noosfere.org/livres/" + comment_AutresCritique.select(
                        'a[href*="serie.asp"]')[0]['href']
                    try:
                        more_comment_AutresCritique = self.get_Critique_de_la_serie(
                            critic_url)
                        comment_AutresCritique.append(
                            more_comment_AutresCritique)
                    except:
                        self.log.exception(
                            "get_Critique_de_la_serie failed for url: ",
                            critic_url)

                if debug:
                    self.log.info(self.who, "comment_AutresCritique processed")

    # group in a big bundle all the fields I think I want... (It is difficult not to include more... :-))

        if comment_cover:
            vol_comment_soup.append(comment_cover)
        if comment_generic:
            vol_comment_soup.append(comment_generic)
        if comment_resume:
            vol_comment_soup.append(comment_resume)
        if comment_Critiques:
            vol_comment_soup.append(comment_Critiques)
        if comment_Sommaire:
            vol_comment_soup.append(comment_Sommaire)
        if comment_AutresCritique:
            vol_comment_soup.append(comment_AutresCritique)
        if comment_decoupage_annexe:
            vol_comment_soup.append(
                comment_pre_decoupage_annexe)  # this is the title
            vol_comment_soup.append(comment_decoupage_annexe)

    #
    # Do a minimum of "repair" on vol_comment_soup so that it displays the way I like it in the comments and in my catalogs
    # - I hate justify when it makes margins "float" around the correct position (in fact, when spaces are used instead of absolute positioning)
    # - I like to have functional urls when they exist
    # - I like to find out the next and/or previous books in a series (simulated arrows are links :-) )

        for elemnt in vol_comment_soup.select('[align="justify"]'):
            del elemnt['align']

    # remove all double or triple 'br' to improve presentation.
    # Note: tmp1 and tmp2 must contain a different value from any possible first elemnt. (yes, I am lrp and I am unique :-) )
    #
    # yeah, and so what: if I modify comment_generic AFTER it has been integrated into vol_comment_soup, there is only
    # one version in memory... so vol_comment_soup gets modified too...
    #

        tmp1 = tmp2 = "lrp_the_unique"
        for elemnt in vol_comment_soup.findAll():
            tmp1, tmp2 = tmp2, elemnt
            if tmp1 == tmp2:
                elemnt.extract()

        br = soup.new_tag('br')
        for elemnt in vol_comment_soup.select('.AuteurNiourf'):
            elemnt.insert(0, br)
            elemnt["style"] = "font-weight: 600; font-size: 18px"

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who, "incomplete url before correction: ",
                                  elemnt)

        # Behaviour-preserving consolidation of the repeated per-pattern loops:
        # rewrite relative noosfere hrefs to absolute URLs. The specific paths
        # come first so the generic *.asp patterns only catch what is left.
        url_fixes = (
            ("/livres/auteur.asp", "https://www.noosfere.org/livres/auteur.asp"),
            ("/livres/niourf.asp", "https://www.noosfere.org/livres/niourf.asp"),
            ("/heberg/", "https://www.noosfere.org/heberg/"),
            ("./EditionsLivre.asp", "https://www.noosfere.org/livres/EditionsLivre.asp"),
            ("./niourf.asp", "https://www.noosfere.org/livres/niourf.asp"),
            ("../../heberg", "https://www.noosfere.org/heberg"),
            ("../bd", "https://www.noosfere.org/bd"),
            ("auteur.asp", "https://www.noosfere.org/livres/auteur.asp"),
            ("collection.asp", "https://www.noosfere.org/livres/collection.asp"),
            ("critsign.asp", "https://www.noosfere.org/livres/critsign.asp"),
            ("EditionsLivre.asp", "https://www.noosfere.org/livres/EditionsLivre.asp"),
            ("editeur.asp", "https://www.noosfere.org/livres/editeur.asp"),
            ("editionslivre.asp", "https://www.noosfere.org/livres/editionslivre.asp"),
            ("niourf.asp", "https://www.noosfere.org/livres/niourf.asp"),
            ("serie.asp", "https://www.noosfere.org/livres/serie.asp"),
        )
        for pattern, absolute in url_fixes:
            for elemnt in vol_comment_soup.select("a[href*='%s']" % pattern):
                if 'http' not in elemnt.get('href'):
                    elemnt["href"] = elemnt["href"].replace(pattern, absolute)

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who, "incomplete url after correction: ",
                                  elemnt)

        fg, fd = "<<==", "==>>"  #chr(0x21D0),chr(0x21D2)   #chr(0x27f8),chr(0x27f9)
        for elemnt in vol_comment_soup.select("img[src*='arrow_left']"):
            elemnt.replace_with(fg)
        for elemnt in vol_comment_soup.select("img[src*='arrow_right']"):
            elemnt.replace_with(fd)

        # depending on the tick box, make a fat publisher using separators that have a very low probability of popping up (§ and €)
        # only set vol_coll_srl if vol_coll exists
        # the idea is to use search and replace in the "edit Metadata in bulk" window.

        if self.extended_publisher:
            if debug:
                self.log.info(
                    self.who,
                    """flag : "Ajoute collection et son numéro d'ordre au champ èditeur" set"""
                )
            if vol_coll:
                if debug: self.log.info(self.who, 'add collection')
                vol_editor = vol_editor + ('§') + vol_coll
                if vol_coll_srl:
                    if debug: self.log.info(self.who, 'add collection number')
                    vol_editor = vol_editor + ('€') + vol_coll_srl

        if vol_serie:
            # float() also accepts values like "2.5" that isnumeric() rejects
            try:
                vol_serie_seq = float(vol_serie_seq)
            except (TypeError, ValueError):
                vol_serie_seq = 1.0

        # UTF-8 characters may be serialized in different ways; only xmlcharrefreplace produces xml-compatible strings.
        # Any other non-ascii character with another utf-8 byte representation will make calibre fail with the message:
        # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
        # Side note:
        # I have no really good url structure (I once got html 3 times, div a sibling of html...), but calibre does not seem to care (nice :-) )
        #
        # It took me a crazy amount of time to find out, by chance, that encode('ascii', 'xmlcharrefreplace') helped a lot...
        # (well, almost by chance: I tried everything that could improve compatibility with xml... but I was misreading
        # and thought it was an incompatibility with the xml structure),
        #
        vol_comment_soup = vol_comment_soup.encode('ascii',
                                                   'xmlcharrefreplace')

        self.log.info(self.who, "+++" * 25)
        self.log.info(self.who,
                      "nsfr_id, type()                : ", self.nsfr_id,
                      type(self.nsfr_id))  # must be <class 'str'>
        self.log.info(self.who,
                      "relevance, type()              : ", self.relevance,
                      type(self.relevance))  # must be <class 'float'>
        self.log.info(self.who, "vol_title, type()              : ", vol_title,
                      type(vol_title))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_auteur, type()             : ", vol_auteur,
            type(vol_auteur))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who,
                      "vol_auteur_prenom, type()      : ", vol_auteur_prenom,
                      type(vol_auteur_prenom))  # must be <class 'str'>
        self.log.info(self.who,
                      "vol_auteur_nom, type()         : ", vol_auteur_nom,
                      type(vol_auteur_nom))  # must be <class 'str'>
        if vol_serie:
            self.log.info(self.who, "vol_serie, type()              : ",
                          vol_serie, type(vol_serie))  # must be <class 'str'>
            self.log.info(self.who,
                          "vol_serie_seq, type()          : ", vol_serie_seq,
                          type(vol_serie_seq))  # must be <class 'float'>
        self.log.info(self.who, "vol_editor, type()             : ",
                      vol_editor, type(vol_editor))  # must be <class 'str'>
        self.log.info(self.who, "vol_coll, type()               : ", vol_coll,
                      type(vol_coll))  # must be <class 'str'>
        self.log.info(self.who,
                      "vol_coll_srl, type()           : ", vol_coll_srl,
                      type(vol_coll_srl))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_dp_lgl, type()             : ", vol_dp_lgl,
            type(vol_dp_lgl)
        )  # must be <class 'datetime.datetime'> ('renderer=isoformat')
        self.log.info(self.who, "vol_isbn, type()               : ", vol_isbn,
                      type(vol_isbn))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_genre, type()              : ", vol_genre,
            type(vol_genre))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_cover_index, type()        : ",
                      vol_cover_index, type(vol_cover_index))  # must be <class 'str'>
        self.log.info(self.who, "type(vol_comment_soup)         : ",
                      type(vol_comment_soup)
                      )  # must be byte encoded (start with b'blablabla...
        #        self.log.info(self.who,"vol_comment_soup               :\n",vol_comment_soup)                                # Maybe a bit long sometimes
        # language must be <class 'str'>

        if vol_cover_index:
            self.plugin.cache_identifier_to_cover_url(self.nsfr_id,
                                                      vol_cover_index)

        if vol_isbn:
            self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id)

        mi = Metadata(vol_title, [vol_auteur])
        mi.set_identifier('nsfr_id', self.nsfr_id)
        mi.publisher = vol_editor
        mi.isbn = vol_isbn
        mi.tags = [vol_genre]
        mi.source_relevance = self.relevance
        mi.has_cover = bool(vol_cover_index)
        if vol_dp_lgl:
            mi.pubdate = vol_dp_lgl
        if vol_serie:
            mi.series = vol_serie
            mi.series_index = vol_serie_seq
        mi.language = "fra"

        mi.comments = vol_comment_soup

        if debug: self.log.info(self.who, "mi\n", mi, "\n")
        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
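
The extended_publisher branch above packs the collection name and serial into the publisher field with '§' and '€' separators, precisely so they can be split apart again with search-and-replace in bulk editing. A hypothetical helper that undoes the packing (name and return shape are mine, not the plugin's):

def split_fat_publisher(value):
    # 'Editor§Collection€Serial' -> ('Editor', 'Collection', 'Serial');
    # missing parts come back as None.
    editor, _, rest = value.partition(u'§')
    coll, _, srl = rest.partition(u'€')
    return editor, coll or None, srl or None

assert split_fat_publisher(u'Denoël§Présence du futur€000042') == \
    (u'Denoël', u'Présence du futur', u'000042')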