def parse(self, xml_detail, xml_more_info):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    serie, serie_index = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(as_unicode(title), authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)

        return mi
    else:
        self.log('Result skipped because title or authors were not found')
        return None
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id"""
    issue = pycomicvine.Issue(
        issue_id,
        field_list=[
            "id",
            "name",
            "volume",
            "issue_number",
            "person_credits",
            "description",
            "store_date",
            "cover_date",
        ],
    )
    if not issue or not issue.volume:
        log.warn("Unable to load Issue(%d)" % issue_id)
        return None
    title = "%s #%s" % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ": %s" % (issue.name)
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier("comicvine", str(issue.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
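
# Minimal call sketch for build_meta() above -- a hedged example, assuming a
# configured pycomicvine API key and calibre's default log object. The issue
# id is made up for illustration.
def _demo_build_meta():
    from calibre.utils.logging import default_log
    meta = build_meta(default_log, 12345)  # hypothetical comicvine issue id
    if meta is not None:
        print(meta.title, meta.series, meta.series_index)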
def parse(self, xml_detail):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: str(self.number)}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(str(self.number), cover)

        return mi
    else:
        return None
def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date
    from calibre.db.write import get_series_values
    if 'meta' not in extensions:
        extensions.append('meta')
    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or _('Unknown'))
    m = md.Meta
    for k, v in iteritems({'date': 'pubdate', 'summary': 'comments'}):
        if v not in m and k in m:
            m[v] = m.pop(k)
    for k in 'title authors series tags pubdate comments publisher rating'.split():
        val = m.get(k)
        if val:
            mf = mi.metadata_for_field(k)
            if not mf.get('is_multiple'):
                val = val[0]
            if k == 'series':
                val, si = get_series_values(val)
                mi.series_index = 1 if si is None else si
            if k == 'rating':
                try:
                    val = max(0, min(int(float(val)), 10))
                except Exception:
                    continue
            if mf.get('datatype') == 'datetime':
                try:
                    val = parse_only_date(val, assume_utc=False)
                except Exception:
                    continue
            setattr(mi, k, val)
    return mi, HTML_TEMPLATE % (mi.title, html)
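
# Usage sketch for convert_markdown_with_metadata() above -- a hypothetical
# input, assuming a calibre environment where create_markdown_object() and
# HTML_TEMPLATE are available. The header keys follow the Python-Markdown
# 'meta' extension's "Key: value" syntax that the converter reads.
def _demo_convert_markdown_with_metadata():
    sample = (
        'Title: A Sample Book\n'
        'Authors: Jane Doe\n'
        'Series: Samples [2]\n'
        'Rating: 8\n'
        '\n'
        'The *body* of the document starts after the first blank line.\n'
    )
    mi, html = convert_markdown_with_metadata(sample)
    # Expected: mi.title == 'A Sample Book', mi.series == 'Samples',
    # mi.series_index == 2, mi.rating == 8
    return mi, html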
def test(scale=0.5):
    from PyQt5.Qt import QLabel, QApplication, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    app = QApplication([])
    mi = Metadata('xxx', ['Kovid Goyal', 'John Q. Doe', 'Author'])
    mi.series = 'A series of styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    l = QGridLayout(w)
    w.setLayout(l)
    l.setSpacing(30)
    labels = []
    for r, color in enumerate(sorted(default_color_themes)):
        for c, style in enumerate(sorted(all_styles())):
            mi.series_index = c + 1
            mi.title = 'An algorithmic cover [%s]' % color
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            for x in ('cover_width', 'cover_height', 'title_font_size', 'subtitle_font_size', 'footer_font_size'):
                prefs[x] = int(scale * prefs[x])
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            l.addWidget(la, r, c)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()
def test(scale=0.25):
    from PyQt5.Qt import QLabel, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    from calibre.gui2 import Application
    app = Application([])
    mi = Metadata('Unknown', ['Kovid Goyal', 'John & Doe', 'Author'])
    mi.series = 'A series & styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    l = QGridLayout(w)
    w.setLayout(l)
    l.setSpacing(30)
    scale *= w.devicePixelRatioF()
    labels = []
    for r, color in enumerate(sorted(default_color_themes)):
        for c, style in enumerate(sorted(all_styles())):
            mi.series_index = c + 1
            mi.title = 'An algorithmic cover [%s]' % color
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            scale_cover(prefs, scale)
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            img.setDevicePixelRatio(w.devicePixelRatioF())
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            l.addWidget(la, r, c)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)
    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]
    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class': 'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class': 'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
def default_mi(self):
    from calibre.ebooks.metadata.book.base import Metadata
    mi = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
    mi.series = _('A series of samples')
    mi.series_index = 4
    mi.tags = [_('Tag One'), _('Tag Two')]
    mi.publisher = _('Some publisher')
    mi.rating = 4
    mi.identifiers = {'isbn': '123456789', 'url': 'http://calibre-ebook.com'}
    mi.languages = ['eng', 'fra']
    mi.pubdate = mi.timestamp = now()
    return mi
def data2mi(self, item):
    """Converts a single metadata answer in the form of a dict to a
    MetadataInformation object"""

    mi = Metadata(_('Unknown'))

    # Regular metadata
    mi.title = item.get('title', None)
    mi.authors = item.get('authors', [])
    mi.publisher = item.get('publisher', None)

    if 'id' in item:
        mi.set_identifier(self.idkey, item['id'])
    if 'doi' in item:
        mi.set_identifier('doi', item['doi'])
    if 'isbn' in item:
        mi.set_identifier('isbn', item['isbn'])

    if 'updated' in item:
        mi.pubdate = parse_date(item['updated'], assume_utc=True)

    if 'series' in item:
        mi.series = item['series']
        mi.series_index = self.format_series_index(item.get('series_index'), None)

    if 'year' in item:
        mi.pubdate = parse_date(item['year'], assume_utc=True)

    if 'abstract' in item:
        mi.comments = self.format_abstract(item['abstract'])

    if 'language' in item:
        mi.language = item['language']

    if 'journal' in item:
        mi.series = item['journal']
        mi.series_index = self.format_series_index(item.get('volume'), item.get('number'))

    if 'subject' in item:
        tags = set()
        for s in item['subject']:
            tags.update(msc_tags(s))
            tags.update(arxiv_tags(s))
        mi.tags = sorted(tags)

    return mi
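
# A sketch of the kind of answer dict data2mi() above expects: the field
# names are taken from the lookups in the method, the values are made up.
# 'self' would be the metadata-source plugin that provides idkey and the
# format_series_index()/format_abstract() helpers.
_sample_item = {
    'title': 'On Certain Integrals',
    'authors': ['J. Doe'],
    'id': 'zbl-0001.00001',
    'doi': '10.1000/demo',
    'year': '1999',
    'journal': 'Journal of Examples',
    'volume': '12',
    'number': '3',
    'subject': ['14J60'],
}
# mi = self.data2mi(_sample_item)
# -> Metadata with series 'Journal of Examples' and tags from msc_tags('14J60')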
def parse(self, xml_detail):
    sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
    authors = []
    tags = []

    xpath = self.XPath('//table[@id="record"]//tr')
    for row in xpath(xml_detail):
        ch = row.getchildren()
        txt = ch[0].text.strip()
        data = self.normalize(ch[1].text)
        if txt.startswith('245') and title is None:
            title = self.parse_title(data)
        if txt.startswith('246'):
            title = self.parse_title(data)
        elif txt.startswith('100') or txt.startswith('700'):
            res = self.parse_author(data)
            if res is not None:
                authors.append(res)
        elif txt == 'SYS':
            sys_ident = data.strip()
        elif txt == '020':
            isbn = self.parse_isbn(data)
        elif txt == '260':
            publisher, pub_year = self.parse_publisher(data)
        elif txt.startswith('490') and serie is None:
            serie, serie_index = self.parse_serie(data)
        elif txt == '655 7':
            tags.append(self.parse_tags(data))

    if isbn is not None and isbn != '':
        cover = self.parse_cover(isbn)

    if title is not None and len(authors) > 0 and sys_ident is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.identifiers = {self.plugin.name: sys_ident}
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(sys_ident, cover)

        return mi
    else:
        self.log('Data not found')
        return None
def build_meta(log, issue_id): """Build metadata record based on comicvine issue_id.""" issue = PyComicvineWrapper(log).lookup_issue(issue_id) if issue: meta = Metadata(issue.get_full_title(), issue.get_authors()) meta.series = issue.volume_name meta.series_index = issue.issue_number meta.set_identifier('comicvine', str(issue.id)) meta.set_identifier('comicvine-volume', str(issue.volume_id)) meta.comments = issue.description meta.has_cover = False meta.publisher = issue.publisher_name meta.pubdate = issue.date return meta else: return None
def parse(self, xml_detail):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.rating = rating
        mi.tags = tags
        mi.series = serie
        mi.series_index = serie_index
        return mi
    else:
        return None
def build_meta(log, issue_id):
    '''Build metadata record based on comicvine issue_id'''
    issue = pycomicvine.Issue(issue_id, field_list=[
        'id', 'name', 'volume', 'issue_number', 'person_credits',
        'description', 'store_date', 'cover_date'])
    if not issue or not issue.volume:
        log.warn('Unable to load Issue(%d)' % issue_id)
        return None
    title = '%s #%s' % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ': %s' % (issue.name)
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier('comicvine', str(issue.id))
    meta.set_identifier('comicvine-volume', str(issue.volume.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
def get_details(self):
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        self.log.info(raw)
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for biblionet timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    try:
        # root = fromstring(clean_ascii_chars(raw))
        root = json.loads(raw)
        self.log.info(root)
    except:
        msg = 'Failed to parse book detail page: %r' % self.url
        self.log.exception(msg)
        return

    try:
        self.biblionetid = root['biblionetid']
    except:
        self.log.exception('Error parsing book id for url: %r' % self.url)
        self.biblionetid = None

    try:
        self.title = root['title'].strip()
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        self.title = None

    self.series_index = None

    try:
        self.authors = [root['authors'].strip()]
        self.log.info(self.authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    try:
        self.cover_url = root['cover_url']
        self.log.info('Parsed URL for cover:%r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.biblionetid, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    try:
        self.publisher = root['publisher']
        self.log.info('Parsed publisher:%s' % self.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        self.tags = root['categories'].replace('DDC: ', 'DDC:').replace('-', '').split()[:-1]
        self.log.info('Parsed tags:%s' % self.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        self.pubdate = root['yr_published']
        self.log.info('Parsed publication date:%s' % self.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    mi = Metadata(self.title, self.authors)
    mi.set_identifier('biblionet', self.biblionetid)

    if self.series_index:
        try:
            mi.series_index = float(self.series_index)
        except:
            self.log.exception('Error loading series')
    if self.relevance:
        try:
            mi.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    if self.cover_url:
        try:
            mi.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    if self.publisher:
        try:
            mi.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    if self.tags:
        try:
            mi.tags = self.tags
        except:
            self.log.exception('Error loading tags')
    if self.pubdate:
        try:
            if self.pubdate not in (self.yr_msg1, self.yr_msg2):
                d = datetime.date(int(self.pubdate), 1, 1)
                mi.pubdate = d
        except:
            self.log.exception('Error loading pubdate')

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    mi = Metadata(None, template_cache=self.formatter_template_cache)
    author_ids = self._field_ids_for('authors', book_id)
    aut_list = [self._author_data(i) for i in author_ids]
    aum = []
    aus = {}
    aul = {}
    for rec in aut_list:
        aut = rec['name']
        aum.append(aut)
        aus[aut] = rec['sort']
        aul[aut] = rec['link']
    mi.title = self._field_for('title', book_id, default_value=_('Unknown'))
    mi.authors = aum
    mi.author_sort = self._field_for('author_sort', book_id, default_value=_('Unknown'))
    mi.author_sort_map = aus
    mi.author_link_map = aul
    mi.comments = self._field_for('comments', book_id)
    mi.publisher = self._field_for('publisher', book_id)
    n = nowf()
    mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
    mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
    mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
    mi.title_sort = self._field_for('sort', book_id, default_value=_('Unknown'))
    mi.book_size = self._field_for('size', book_id, default_value=0)
    mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
    mi.last_modified = self._field_for('last_modified', book_id, default_value=n)
    formats = self._field_for('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(self._field_for('languages', book_id))
    if not formats:
        good_formats = None
    else:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        good_formats = FormatsList(formats, mi.format_metadata)
    mi.formats = good_formats
    mi.has_cover = _('Yes') if self._field_for('cover', book_id, default_value=False) else ''
    mi.tags = list(self._field_for('tags', book_id, default_value=()))
    mi.series = self._field_for('series', book_id)
    if mi.series:
        mi.series_index = self._field_for('series_index', book_id, default_value=1.0)
    mi.rating = self._field_for('rating', book_id)
    mi.set_identifiers(self._field_for('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id
    composites = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composites.append(key)
        else:
            val = self._field_for(key, book_id)
            if isinstance(val, tuple):
                val = list(val)
            extra = self._field_for(key + '_index', book_id)
            mi.set(key, val=val, extra=extra)
    for key in composites:
        mi.set(key, val=self._composite_for(key, book_id, mi))
    user_cat_vals = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            res = []
            for name, cat, ign in user_cats[ucat]:
                v = mi.get(cat, None)
                if isinstance(v, list):
                    if name in v:
                        res.append([name, cat])
                elif name == v:
                    res.append([name, cat])
            user_cat_vals[ucat] = res
    mi.user_categories = user_cat_vals
    return mi
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long

    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
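
# Usage sketch for get_metadata_() above: only the <title> fallback that is
# visible in the function body is exercised, so no comment or meta tags are
# needed. Assumes a calibre environment for xml_to_unicode() and friends.
def _demo_get_metadata():
    sample = b'<html><head><title>A Sample Title</title></head><body></body></html>'
    mi = get_metadata_(sample)
    print(mi.title)    # -> 'A Sample Title'
    print(mi.authors)  # -> ['Unknown'] (no author data in the sample)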
def parse_details(self, root):
    try:
        goodreads_id = self.parse_goodreads_id(self.url)
    except:
        self.log.exception("Error parsing goodreads id for url: %r" % self.url)
        goodreads_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception("Error parsing title and series for url: %r" % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception("Error parsing authors for url: %r" % self.url)
        authors = []

    if not title or not authors or not goodreads_id:
        self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
        self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier("goodreads", goodreads_id)
    self.goodreads_id = goodreads_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception("Error parsing ISBN for url: %r" % self.url)

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception("Error parsing ratings for url: %r" % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception("Error parsing comments for url: %r" % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception("Error parsing cover for url: %r" % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception("Error parsing tags for url: %r" % self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception("Error parsing publisher and date for url: %r" % self.url)

    mi.source_relevance = self.relevance

    if self.goodreads_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long

    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages', ):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields
    for field in ('comments', ):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace(
                '>', '&gt;').replace('"', '&quot;').replace("'", '&#39;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
def parse_response(cls, response, log):
    metadata_items = []
    tags = []
    series = u''
    title = u''
    translators = []
    series_index = 0
    authors = []
    resp = urllib2.urlopen(response)
    page = html.parse(resp)
    e = page.getroot().find_class('_ga1_on_').pop()
    e.find("noindex").drop_tree()
    for i in e.xpath('//ol/li/a/text()'):
        tags.append(unicode(i))
    for i in e.xpath(u"//div[@class='_ga1_on_']/br[position()=1]/preceding-sibling::a[contains(@href,'/a/')]/text()"):
        authors.append(unicode(i))
    for i in e.xpath(u"//div[@class='_ga1_on_']/br[position()=1]/preceding-sibling::a[preceding::text()[contains(.,'перевод:')]]/text()"):
        translators.append(unicode(i))
    for i in e.xpath("(//div[@class='_ga1_on_']/div[@id='z0']/following-sibling::text())[1]"):
        title += i
    for i in e.xpath("//div[@class='_ga1_on_']/h8"):
        series = i.text_content()
        series_index = re.findall(r'\d+', i.tail)[0]
    for i in e.xpath("./a[contains(@href,'/s/')]/text()"):
        tags.append(unicode(i))
    for i in e.xpath("//div[@class='genre']/a/@href"):
        tags.append(unicode(i.split('/')[-1]))
    log.info(u'Found %s/%s: %s' % (series, series_index, title))
    if tags and series in tags:
        tags.remove(series)
    if translators:
        for t in translators:
            if t in authors:
                authors.remove(t)
    metadata_item = Metadata(title, authors)
    if tags:
        metadata_item.tags = tags
    if series != '':
        metadata_item.series = series
    if series_index != 0:
        metadata_item.series_index = series_index
    metadata_items.append(metadata_item)
    log.info(series, metadata_item.series)
    if u'Игрушечный дом' == series:
        log.info('1')
    if u'Игрушечный дом' == metadata_item.series:
        log.info('2')
    if series == metadata_item.series:
        log.info('3')
    return metadata_items
def parse_details(self, root):
    try:
        antik_id = self.parse_antik_id(root)
        self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
    except:
        self.log.exception('Error parsing Antikvarium id for url: %r' % self.url)
        antik_id = None

    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not antik_id:
        self.log.error('Could not find title/authors/Antikvarium.hu id for %r' % self.url)
        self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' % (antik_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('antik_hu', antik_id)
    self.antik_id = antik_id

    try:
        isbn = self.parse_isbn(root)
        self.log.info('Parsed ISBN: %s' % isbn)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        series = self.parse_series(root)
        self.log.info('Parsed series: %s' % series)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
        series = None

    try:
        mi.series_index = self.parse_series_index(root)
        self.log.info('Parsed series index: %s' % mi.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
        mi.series_index = None

    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.antik_id, self.cover_url)
        mi.has_cover = bool(self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)

    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
        self.log.info('Parsed publication date: %s' % mi.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    try:
        mi.languages = self.parse_languages(root)
        self.log.info('Parsed languages: %r' % mi.languages)
    except:
        self.log.exception('Error parsing languages for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if series:
        mi.series = series

    if self.antik_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        legie_id = self.parse_legie_id(self.url)
    except:
        self.log.exception('Error parsing Legie id for url: %r' % self.url)
        legie_id = None

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not legie_id:
        self.log.error('Could not find title/authors/Legie id for %r' % self.url)
        self.log.error('Legie: %r Title: %r Authors: %r' % (legie_id, title, authors))
        return

    self.legie_id = legie_id

    rating = comments = series = series_index = None
    try:
        rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        (series, series_index) = self.parse_series(root)
    except:
        self.log.info('Series not found.')
    try:
        tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
        tags = None

    if legie_id:
        editions = self.get_editions()

        if editions:
            num_editions = len(editions)
            self.log.info('Nalezeno %d vydani' % num_editions)  # 'Found %d editions'
            for edition in editions:
                (year, cover_url, publisher, isbn) = edition
                mi = Metadata(title, authors)
                self.legie_id = "%s#%s" % (legie_id, year)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                if cover_url:
                    mi.cover_url = self.cover_url = cover_url
                    self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                mi.publisher = publisher
                mi.isbn = isbn
                mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
        else:
            mi = Metadata(title, authors)
            mi.set_identifier('legie', self.legie_id)
            mi.source_relevance = self.relevance
            mi.rating = rating
            mi.comments = comments
            mi.series = series
            mi.series_index = series_index
            try:
                self.cover_url = self.parse_cover(root)
            except:
                self.log.exception('Error parsing cover for url: %r' % self.url)
            if tags:
                mi.tags = tags
            mi.has_cover = bool(self.cover_url)
            # Edition details (publisher, ISBN, year) come from get_editions();
            # without any edition they are unavailable, so they are not set here.
            mi.language = "ces"
            self.result_queue.put(mi)

    if self.legie_id:
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
def parse_details(self, root):
    try:
        yes24_id = self.parse_yes24_id(self.url)
    except:
        self.log.exception('Error parsing YES24 id for url: %r' % self.url)
        yes24_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not yes24_id:
        self.log.error('Could not find title/authors/YES24 id for %r' % self.url)
        self.log.error('YES24: %r Title: %r Authors: %r' % (yes24_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('yes24', yes24_id)
    self.yes24_id = yes24_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!

    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    mi.language = 'ko'

    mi.source_relevance = self.relevance

    if self.yes24_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def _GoodreadsBook_to_Metadata(self, book):
    # type: (_GoodreadsBook) -> Metadata
    """
    :param book: _GoodreadsBook: book
    :return: Metadata: Metadata
    """
    mi = Metadata(book.title, book.authors)
    mi.source_relevance = 0
    mi.set_identifier('goodreads', book.id)

    if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get('isbn'):
        mi.set_identifier('isbn', '')

    if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']:
        mi.set_identifier('amazon', book.asin)

    if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']:
        try:
            if len(book.isbn) == 10:
                mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn))
            else:
                mi.isbn = check_isbn13(book.isbn)
        except:
            self.log.error("ISBN CONVERSION ERROR:", book.isbn)
            self.log.exception()

    if book.image_url:
        self.log.info('cache_identifier_to_cover_url:', book.asin, ':', book.image_url)
        self.cache_identifier_to_cover_url(book.id, book.image_url)

    if book.publisher:
        self.log.info('book.publisher is:', book.publisher)
        mi.publisher = book.publisher

    if book.pubdate:
        self.log.info('book.pubdate is:', book.pubdate.strftime('%Y-%m-%d'))
        mi.pubdate = book.pubdate

    if book.comments:
        self.log.info('book.editorial_review is:', book.comments)
        mi.comments = book.comments

    tags = self.prefs['ADD_THESE_TAGS'].split(',')
    tags.extend(book.tags)
    # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings']
    # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags))))

    if book.series:
        mi.series = book.series
        self.log.info(u'series:', book.series)
        if book.series_index:
            mi.series_index = book.series_index
            self.log.info(u'series_index:', "{0:.2f}".format(book.series_index))
        else:
            mi.series_index = 0

    if book.average_rating:
        mi.rating = book.average_rating

    self.clean_downloaded_metadata(mi)
    return mi
def get_details(self):
    '''
    The get_details() function for stripping the website for all information
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the html code returned 404
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the title of the book
    try:
        title_node = root.xpath('//span[@itemprop="name"]')
        self.title = title_node[0].text
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//span[@class="expandAuthorName"]')
        author_strings = author_node[0].text.split(",")
        # print(author_strings)
        for name in author_strings:
            self.authors.append(name)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Get the series of the book
    try:
        series_node = root.xpath('//b[contains(text(), "Serie")]/a')
        if len(series_node) > 0:
            self.series = series_node[0].text.split(": ")[0].strip()
            self.series_index = series_node[0].text.split(": ")[-1].strip()
            # print("'%s'" % self.series)
            # print("'%s'" % self.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Some books have ratings, let's use them.
    try:
        self.rating = 0.0
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        isbn_node = root.xpath('//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]')
        if len(isbn_node) > 0:
            self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        comment_node = root.xpath('//meta[@name="description"]/@content')
        self.comments = comment_node[0]
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        cover_node = root.xpath('//div[@class="bookDetailCoverCover"]/img/@src')
        self.cover_url = "https://mofibo.com" + cover_node[0]
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        publisher_node = root.xpath('//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
        if len(publisher_node) > 0:
            self.publisher = publisher_node[0].text
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported, though.
    try:
        language_node = root.xpath('//b[@class="expanderLanguage"]')
        language = language_node[0].text.strip().replace("Sprog:", "").replace(" ", "")
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the published date
    try:
        pubdate_node = root.xpath('//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]')
        if len(pubdate_node) > 0:
            date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
            format_str = '%Y-%m-%d'  # The format
            self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Get the tags
    try:
        tags = []
        tags_node = root.xpath('//span[@itemprop="category"]')
        tags.append(tags_node[0].text.strip())
        self.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Setup the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('mofibo', self.url)

    # Set series
    if self.series:
        try:
            meta_data.series = self.series
            meta_data.series_index = self.series_index
        except:
            self.log.exception('Error loading series')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set pubdate
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')
    # Set tags data
    if self.tags:
        try:
            meta_data.tags = self.tags
        except:
            self.log.exception('Error loading tags')

    # Put meta data
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
    self.load_config()

    if authors is None:
        authors = []

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # ignore unknown authors
    ignored_authors = ["V. A.", "V.A.", "Unknown", "Unbekannt"]
    for i in ignored_authors:
        authors = [x for x in authors if x != i]

    if (isbn is None) and (idn is None) and (title is None) and (not authors):
        log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
        return None

    queries = []

    # DNB does not do an exact search when searching for an IDN or ISBN, so we
    # have to filter the results ourselves
    exact_search = {}
    if isbn is not None:
        exact_search['isbn'] = isbn
    if idn is not None:
        exact_search['idn'] = idn
        # when looking for an IDN, search only for the IDN and skip all the other stuff
        queries.append('num=' + idn)
    else:
        authors_v = []
        title_v = []

        # create some variants of given authors
        if authors != []:
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=False)))  # concat all author names ("Peter Meier Luise Stark")
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=True)))  # use only first author
            for a in authors:
                authors_v.append(a)  # use all authors, one by one

            # remove duplicates
            unique_authors_v = []
            for i in authors_v:
                if i not in unique_authors_v:
                    unique_authors_v.append(i)

        # create some variants of given title
        if title is not None:
            title_v.append(title)  # simply use given title
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False)))  # remove some punctuation characters
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))  # remove subtitle (everything after " : ")
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=False)))  # remove some punctuation characters and joiners ("and", "&", ...)
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)))  # remove subtitle (everything after " : ") and joiners ("and", "&", ...)
            # TODO: remove subtitle after " - "

            # remove duplicates
            unique_title_v = []
            for i in title_v:
                if i not in unique_title_v:
                    unique_title_v.append(i)

        # title and author
        if authors_v != [] and title_v != []:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')

            # try with first author as title and title (without subtitle) as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

            # try with author and title (without subtitle) in any index
            if isbn is not None:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # author but no title
        elif authors_v != [] and title_v == []:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')

            # try with author as title
            if isbn is not None:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # title but no author
        elif authors_v == [] and title_v != []:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')

            # try with title as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '"')

        # as last resort only use isbn
        if isbn is not None:
            queries.append('num=' + isbn)

    # remove duplicate queries
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None

    for query in uniqueQueries:
        # SRU does not work with "+" or "?" characters in query, so we simply remove them
        query = re.sub(r'[\+\?]', '', query)

        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
        log.info(query)

        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)

        if results is None:
            continue

        log.info("Parsing records")

        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}

        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            authors = []
            author_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []
            publisher_name = None
            publisher_location = None

            ##### Field 264 #####
            # Publisher Name and Location
            fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()

            # Publishing Date
            for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns):
                match = re.search(r"(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 1, 12, 30, 0)
                    break

            # Log
            if publisher_name is not None:
                log.info("Extracted Publisher: %s" % publisher_name)
            if publisher_location is not None:
                log.info("Extracted Publisher Location: %s" % publisher_location)
            if pubdate is not None:
                log.info("Extracted Publication Year: %s" % pubdate)

            ##### Field 245 #####
            # Title/Series/Series_Index
            title_parts = []
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                # if a,n,p,n,p,n,p exist: series = a + n0 + " - " + p0 + n1 + " - " + p1, series_index = n2, title = p2
                # if a,n,p,n,p exist:     series = a + n0 + " - " + p0, series_index = n1, title = p1 (Example: dnb-id 1008774839)
                # if a,n,p exist:         series = a, series_index = n, title = p
                # if a exist:             title = a
                # TODO: a,n,p,n (i.e. 956375146)
                code_p = []
                code_n = []
                code_a = []

                for j in i.xpath(".//marc21:subfield[@code='p']", namespaces=ns):
                    code_p.append(j.text.strip())

                for j in i.xpath(".//marc21:subfield[@code='n']", namespaces=ns):
                    match = re.search(r"(\d+[,\.\d+]?)", j.text.strip())
                    if match:
                        code_n.append(match.group(1))
                    else:
                        # looks like sometimes DNB does not know the series index and uses something like "[...]"
                        code_n.append("0")

                for j in i.xpath(".//marc21:subfield[@code='a']", namespaces=ns):
                    code_a.append(j.text.strip())

                if len(code_p) == 0:
                    title_parts = title_parts + code_a
                elif len(code_p) > 0 and len(code_p) == len(code_n):
                    series = " : ".join(code_a)  # I've never seen more than one code_a, but who knows...
                    for i in range(0, len(code_p) - 1):
                        series = series + " " + code_n[i] + " " + code_p[i]
                    series_index = code_n[-1]
                    title_parts.append(code_p[-1])

            # subtitle 1: Field 245
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns):
                title_parts.append(i.text.strip())
                break

            # subtitle 2
            #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
            #    title = title + " / " + i.text.strip()
            #    break

            title = " : ".join(title_parts)

            # Log
            if series_index is not None:
                log.info("Extracted Series_Index from Field 245: %s" % series_index)
            if series is not None:
                log.info("Extracted Series from Field 245: %s" % series)
                series = self.cleanUpSeries(log, series, publisher_name)
            if title is not None:
                log.info("Extracted Title: %s" % title)
                title = self.cleanUpTitle(log, title)

            # Title_Sort
            if len(title_parts) > 0:
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match('^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(filter(None, [title_sort_regex.group(1).strip(), title_sort_regex.group(3).strip(), ", " + sortword]))
                title_sort = " : ".join(title_sort_parts)

            # Log
            if title_sort is not None:
                log.info("Extracted Title_Sort: %s" % title_sort)

            ##### Field 100 and Field 700 #####
            # Authors
            for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # primary authors
                name = re.sub(r" \[.*\]$", "", i.text.strip())
                authors.append(name)
            for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # secondary authors
                name = re.sub(r" \[.*\]$", "", i.text.strip())
                authors.append(name)
            if len(authors) == 0:  # if no "real" author was found take all persons involved
                for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    name = re.sub(r" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]

            # Log
            if len(authors) > 0:
                log.info("Extracted Authors: %s" % " & ".join(authors))
            if author_sort is not None:
                log.info("Extracted Author_Sort: %s" % " & ".join(authors))

            ##### Field 856 #####
            # Comments
            for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        comments = re.sub(r'(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                        comments = sanitize_comments_html(comments)
                        break
                    except:
                        log.info("Could not download Comments from %s" % i)

            # Log
            if comments is not None:
                log.info('Comments: %s' % comments)

            # If no comments are found for this edition, look at other editions of this book (Field 776)
            # TODO: Make this configurable (default: yes)
            if comments is None:
                # get all other issues
                for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]", namespaces=ns):
                    other_idn = re.sub(r"^\(.*\)", "", i.text.strip())
                    subquery = 'num=' + other_idn + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'

                    log.info(subquery)

                    if self.cfg_dnb_token is None:
                        subresults = self.getSearchResultsByScraping(log, subquery, timeout)
                    else:
                        subresults = self.getSearchResults(log, subquery, timeout)

                    if subresults is None:
                        continue

                    for subrecord in subresults:
                        for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                            if i.text.startswith("http://deposit.dnb.de/"):
                                br = self.browser
                                log.info('Downloading Comments from: %s' % i.text)
                                try:
                                    comments = br.open_novisit(i.text, timeout=30).read()
                                    comments = re.sub(r'(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                                    comments = sanitize_comments_html(comments)
                                    break
                                except:
                                    log.info("Could not download Comments from %s" % i)
                        if comments is not None:
                            log.info('Comments from other issue: %s' % comments)
                            break

            ##### Field 016 #####
            # ID: IDN
            for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                idn = i.text.strip()
                break
            # Log
            if idn is not None:
                log.info("Extracted ID IDN: %s" % idn)

            ##### Field 024 #####
            # ID: URN
            for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                urn = i.text.strip()
                break
            # Log
            if urn is not None:
                log.info("Extracted ID URN: %s" % urn)

            ##### Field 020 #####
            # ID: ISBN
            for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                isbn = match.group()
                isbn = isbn.replace('-', '')
                break
            # Log
            if isbn is not None:
                log.info("Extracted ID ISBN: %s" % isbn)

            # When doing an exact search for a given ISBN skip books with wrong ISBNs
            if isbn is not None and "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info("Extracted ISBN does not match book's ISBN, skipping record")
                    continue

            ##### Field 082 #####
            # ID: Sachgruppe (DDC)
            for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                ddc.append(i.text.strip())
            # Log
            if len(ddc) > 0:
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

            ##### Field 490 #####
            # In theory this field is not used for "real" book series, use field 830 instead. But it is used.
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # "v" is either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a
                    attr_v = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    parts = re.split(" : ", attr_v)
                    if len(parts) == 2:
                        if bool(re.search(r"\d", parts[0])) != bool(re.search(r"\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search(r"\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]
                            match = re.search(r"(\d+[,\.\d+]?)", indexpart)
                            if match is not None:
                                series_index = match.group(1)
                                series = textpart.strip()
                    else:
                        match = re.search(r"(\d+[,\.\d+]?)", attr_v)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"

                    series_index = series_index.replace(',', '.')

                    # Use Series Name from attribute "a" if not already found in attribute "v"
                    if series is None:
                        series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()

                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 490: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 490: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 246 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    match = re.search(r"^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
                    if match is not None:
                        series = match.group(1)
                        series_index = match.group(2)

                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 246: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 246: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 800 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search(r"(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='t']", namespaces=ns)[0].text.strip()

                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 800: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 800: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 830 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search(r"(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()

                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 830: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 830: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 689 #####
            # GND Subjects
            for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                subjects_gnd.append(i.text.strip())
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            # Log
            if len(subjects_gnd) > 0:
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            ##### Fields 600-655 #####
            # Non-GND subjects
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects:
            for i in subjects_non_gnd:
                if len(i) < 2:
                    subjects_non_gnd.remove(i)
            # Log
            if len(subjects_non_gnd) > 0:
                log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

            ##### Field 250 #####
            # Edition
            for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                edition = i.text.strip()
                break
            # Log
            if edition is not None:
                log.info("Extracted Edition: %s" % edition)

            ##### Field 041 #####
            # Languages
            for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                languages.append(i.text.strip())
            # Log
            if languages is not None:
                log.info("Extracted Languages: %s" % ",".join(languages))

            ##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
            #if self.cfg_guess_series is True:
            if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                guessed_title = None

                log.info("Starting Series Guesser")

                parts = re.split("[:]", self.removeSortingCharacters(title))

                if len(parts) == 2:
                    log.info("Title has two parts")
                    # make sure only one part of the two parts contains digits
                    if bool(re.search(r"\d", parts[0])) != bool(re.search(r"\d", parts[1])):
                        log.info("only one title part contains digits")
                        # figure out which part contains the index
                        if bool(re.search(r"\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]

                        # Look at the part without digits:
                        # remove odd characters from start and end of the text part
                        match = re.match(r"^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)
                        if match:
                            textpart = match.group(1)

                        # Look at the part with digits:
                        # for Titleparts like: "Name of the series - Episode 2"
                        match = re.match(r"^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                guessed_title = textpart + " : Band " + guessed_series_index
                            else:
                                guessed_title = textpart
                            #log.info("ALGO1: guessed_title: " + guessed_title)
                            #log.info("ALGO1: guessed_series: " + guessed_series)
                            #log.info("ALGO1: guessed_series_index: " + guessed_series_index)
                        else:
                            # for Titleparts like: "Episode 2 Name of the series"
                            match =
re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$",indexpart) if match: guessed_series_index = match.group(1) guessed_series = match.group(2) if guessed_series is None: # sometimes books with multiple volumes are detected as series without name -> Add the volume to the title guessed_series = textpart guessed_title = textpart + " : Band " + guessed_series_index else: guessed_title = textpart #log.info("ALGO2: guessed_title: " + guessed_title) #log.info("ALGO2: guessed_series: " + guessed_series) #log.info("ALGO2: guessed_series_index: " + guessed_series_index) else: # for titleparts like: "Band 2" match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$",indexpart) if match: guessed_series_index = match.group(1) # ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE # some false positives match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$",textpart) if match: guessed_series = match.group(1) guessed_title = match.group(2) log.info("ALGO3: guessed_title: " + guessed_title) log.info("ALGO3: guessed_series: " + guessed_series) log.info("ALGO3: guessed_series_index: " + guessed_series_index) elif len(parts)==1: log.info("Title has one part") # for Titles like: "Name of the series - Title (Episode 2)" match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0]) if match: guessed_series_index = match.group(3) guessed_series = match.group(1) guessed_title = match.group(2) #log.info("ALGO4: guessed_title: " + guessed_title) #log.info("ALGO4: guessed_series: " + guessed_series) #log.info("ALGO4: guessed_series_index: " + guessed_series_index) else: # for Titles like: "Name of the series - Episode 2" match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0]) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) guessed_title = guessed_series + " : Band " + guessed_series_index #log.info("ALGO5: guessed_title: " + guessed_title) #log.info("ALGO5: guessed_series: " + guessed_series) #log.info("ALGO5: guessed_series_index: " + guessed_series_index) # Log if guessed_series is not None: log.info("Guessed Series: %s" % guessed_series) #guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name) if guessed_series_index is not None: log.info("Guessed Series Index: %s" % guessed_series_index) if guessed_title is not None: log.info("Guessed Title: %s" % guessed_title) guessed_title = self.cleanUpTitle(log, guessed_title) if guessed_series is not None and guessed_series_index is not None and guessed_title is not None: title = guessed_title series = guessed_series series_index = guessed_series_index ##### Filter exact searches ##### # When doing an exact search for a given IDN skip books with wrong IDNs # TODO: Currently exact_search for ISBN is not implemented. 
Would require ISBN-10 and ISBN-13 conversions if idn is not None and "idn" in exact_search: if idn != exact_search["idn"]: log.info("Extracted IDN does not match book's IDN, skipping record") continue ##### Put it all together ##### if self.cfg_append_edition_to_title and edition is not None: title = title + " : " + edition mi = Metadata(self.removeSortingCharacters(title), [self.removeSortingCharacters(i) for i in authors]) mi.title_sort = self.removeSortingCharacters(title_sort) mi.author_sort = self.removeSortingCharacters(author_sort) mi.languages = languages mi.pubdate = pubdate mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)])) mi.series = self.removeSortingCharacters(series) mi.series_index = series_index mi.comments = comments mi.isbn = isbn # also required for cover download mi.set_identifier('urn',urn) mi.set_identifier('dnb-idn',idn) mi.set_identifier('ddc', ",".join(ddc)) # cfg_fetch_subjects: # 0: use only subjects_gnd if self.cfg_fetch_subjects == 0: mi.tags = self.uniq(subjects_gnd) # 1: use only subjects_gnd if found, else subjects_non_gnd elif self.cfg_fetch_subjects == 1: if len(subjects_gnd)>0: mi.tags = self.uniq(subjects_gnd) else: mi.tags = self.uniq(subjects_non_gnd) # 2: use subjects_gnd and subjects_non_gnd elif self.cfg_fetch_subjects == 2: mi.tags = self.uniq(subjects_gnd + subjects_non_gnd) # 3: use only subjects_non_gnd if found, else subjects_gnd elif self.cfg_fetch_subjects == 3: if len(subjects_non_gnd)>0: mi.tags = self.uniq(subjects_non_gnd) else: mi.tags = self.uniq(subjects_gnd) # 4: use only subjects_non_gnd elif self.cfg_fetch_subjects == 4: mi.tags = self.uniq(subjects_non_gnd) # 5: use no subjects at all elif self.cfg_fetch_subjects == 5: mi.tags = [] # put current result's metadata into result queue log.info("Final formatted result: \n%s" % mi) result_queue.put(mi)
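# The DNB worker above relies on two helpers defined elsewhere in the plugin: removeSortingCharacters() and uniq(). A minimal, hypothetical sketch of both (not the plugin's actual code), assuming DNB brackets non-sorting article prefixes with the C1 control characters 0x98/0x9C, the same chr(152)/chr(156) pair the Field 245 title-sort regex matches:

def remove_sorting_characters(text):
    # Hypothetical sketch: drop the C1 "start of string" / "string terminator"
    # markers DNB uses to bracket non-sorting prefixes ("Der", "Die", "Das", ...).
    if text is None:
        return None
    return ''.join(c for c in text if ord(c) not in (0x98, 0x9C))


def uniq(items):
    # Hypothetical sketch: de-duplicate while keeping first-occurrence order,
    # so the GND subjects stay in the sequence the MARC record listed them.
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]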
def parse_details(self, root): try: moly_id = self.parse_moly_id(self.url) self.log.info('Parsed moly.hu identifier: %s' % moly_id) except: self.log.exception( 'Error parsing moly.hu id for url: %r' % self.url) moly_id = None try: title = self.parse_title(root) self.log.info('Parsed title: %s' % title) except: self.log.exception('Error parsing title for url: %r' % self.url) title = None try: authors = self.parse_authors(root) self.log.info('Parsed authors: %s' % authors) except: self.log.exception('Error parsing authors for url: %r' % self.url) authors = [] if not title or not authors or not moly_id: self.log.error( 'Could not find title/authors/moly.hu id for %r' % self.url) self.log.error('Moly.hu id: %r Title: %r Authors: %r' % (moly_id, title, authors)) return mi = Metadata(title, authors) mi.set_identifier('moly_hu', moly_id) self.moly_id = moly_id try: isbn = self.parse_isbn(root) self.log.info('Parsed ISBN: %s' % isbn) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r' % self.url) try: series_info = self.parse_series(root) if series_info is not None: mi.series = series_info[0] mi.series_index = int(series_info[1]) self.log.info('Parsed series: %s, series index: %f' % (mi.series, mi.series_index)) except: self.log.exception('Error parsing series for url: %r' % self.url) try: mi.comments = self.parse_comments(root) self.log.info('Parsed comments: %s' % mi.comments) except: self.log.exception('Error parsing comments for url: %r' % self.url) try: self.cover_url = self.parse_covers(root) self.log.info('Parsed URL for cover: %r' % self.cover_url) self.plugin.cache_identifier_to_cover_url( self.moly_id, self.cover_url) mi.has_cover = bool(self.cover_url) except: self.log.exception('Error parsing cover for url: %r' % self.url) try: mi.tags = self.parse_tags(root) self.log.info('Parsed tags: %s' % mi.tags) except: self.log.exception('Error parsing tags for url: %r' % self.url) try: mi.languages = self.parse_languages(mi.tags) self.log.info('Parsed languages: %r' % mi.languages) except: self.log.exception('Error parsing language for url: %r' % self.url) try: mi.publisher = self.parse_publisher(root) self.log.info('Parsed publisher: %s' % mi.publisher) except: self.log.exception( 'Error parsing publisher for url: %r' % self.url) try: mi.pubdate = self.parse_published_date(root) self.log.info('Parsed publication date: %s' % mi.pubdate) except: self.log.exception( 'Error parsing published date for url: %r' % self.url) try: mi.rating = self.parse_rating(root) self.log.info('Parsed rating: %s\n\n' % mi.rating) except: self.log.exception('Error parsing rating for url: %r\n\n' % self.url) mi.source_relevance = self.relevance if self.moly_id and self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
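# parse_details() above repeats the same try/except/log.exception dance for every field. A hypothetical refactoring of that pattern (safe_parse is my name, not the plugin's) that would keep the behaviour while cutting the boilerplate:

def safe_parse(log, url, what, func, default=None):
    # Run a single field parser; on any failure, log the traceback together
    # with the offending URL and fall back to a per-field default value.
    try:
        return func()
    except Exception:
        log.exception('Error parsing %s for url: %r' % (what, url))
        return default

# e.g. isbn = safe_parse(self.log, self.url, 'ISBN', lambda: self.parse_isbn(root))
# e.g. authors = safe_parse(self.log, self.url, 'authors', lambda: self.parse_authors(root), default=[])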
def parse_details(self, root): try: isbn = self.extract_isbn(self.url) except: self.log.exception('No ISBN in URL: %r'%self.url) isbn = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not isbn: self.log.error('Could not find title/authors/Aladin id for %r'%self.url) self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index #mi.set_identifier('isbn', isbn) mi.isbn = isbn self.isbn = isbn # ISBN-13 try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! if mi.has_cover: self.log.info('Cover URL: '+mi.cover_url) try: mi.publisher = self.parse_publisher(root) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) except: self.log.exception('Error parsing published date for url: %r'%self.url) mi.language = 'ko' mi.source_relevance = self.relevance self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
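# extract_isbn() is called above but not shown. A hedged sketch of one plausible implementation, assuming the Aladin detail URL embeds a bare ISBN-10 or ISBN-13 digit run; the real method may differ:

import re

def extract_isbn(url):
    # Hypothetical sketch: pull the first ISBN-looking token
    # (10 or 13 characters, optional 978/979 prefix) out of a URL.
    match = re.search(r'(?:97[89])?\d{9}[\dXx]', url)
    return match.group(0).upper() if match else None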
def parse_details(self, root): try: kyobobook_id = self.parse_kyobobook_id(self.url) except: self.log.exception('Error parsing Kyobobook id for url: %r'%self.url) kyobobook_id = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not kyobobook_id: self.log.error('Could not find title/authors/kyobobook id for %r'%self.url) self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index mi.set_identifier('kyobobook', kyobobook_id) self.kyobobook_id = kyobobook_id try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.rating = self.parse_rating(root) except: self.log.exception('Error parsing ratings for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) try: tags = self.parse_tags(root) if tags: mi.tags = tags except: self.log.exception('Error parsing tags for url: %r'%self.url) try: mi.publisher, mi.pubdate = self.parse_publisher_and_date(root) except: self.log.exception('Error parsing publisher and date for url: %r'%self.url) try: lang = self._parse_language(root) if lang: mi.language = lang except: self.log.exception('Error parsing language for url: %r'%self.url) mi.source_relevance = self.relevance if self.kyobobook_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id) if self.cover_url: self.plugin.cache_identifier_to_cover_url(self.kyobobook_id, self.cover_url) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
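# parse_title_series() above returns a (title, series, series_index) triple. A sketch of one way such a splitter can work, assuming page titles shaped like 'Some Title (Series Name 3)'; the real Kyobobook markup may differ:

import re

def parse_title_series(raw_title):
    # Hypothetical sketch: split 'Title (Series 3)' into its three parts,
    # falling back to a bare title when no series suffix is present.
    match = re.match(r'^(.+?)\s*\(([^()]+?)\s+(\d+(?:\.\d+)?)\)\s*$', raw_title)
    if match:
        return match.group(1), match.group(2), float(match.group(3))
    return raw_title, None, None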
def get_details(self): self.log.info(" Worker.get_details:") self.log.info(" self: ", self) self.log.info(" self.url: ", self.url) # We should not even be here if we are not processing an ebook hit if self.url.find("/ebook/") == -1: return try: raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() except Exception as e: if callable(getattr(e, 'getcode', None)) and e.getcode() == 404: self.log.error('URL malformed: %r' % self.url) return attr = getattr(e, 'args', [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): msg = 'Beam Ebooks timed out. Try again later.' self.log.error(msg) else: msg = 'Failed to make details query: %r' % self.url self.log.exception(msg) return # raw = raw.decode('utf-8', errors='replace') raw = raw.decode('iso-8859-1', errors='replace') # open('D:\\work\\calibre-dump-book-details.html', 'wb').write(raw) if '<title>404 - ' in raw: self.log.error('URL malformed: %r' % self.url) return try: # root = fromstring(clean_ascii_chars(raw)) root = fromstring(raw) except: msg = 'Failed to parse beam ebooks details page: %r' % self.url self.log.exception(msg) return try: self.beam_ebooks_id = self.parse_beam_ebooks_id(self.url) except: self.log.exception('Error parsing beam ebooks id for url: %r' % self.url) self.beam_ebooks_id = None try: (self.title, self.series_index) = self.parse_title(root) except: self.log.exception('Error parsing title for url: %r' % self.url) self.title = None self.series_index = None try: self.authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r' % self.url) self.authors = None mi = Metadata(self.title, self.authors) mi.set_identifier('beam-ebooks', self.beam_ebooks_id) if self.series_index: mi.series_index = float(self.series_index) self._determine_perry_rhodan_cycle_name(mi) mi.source_relevance = self.relevance self.plugin.clean_downloaded_metadata(mi) print(mi) self.result_queue.put(mi)
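# parse_beam_ebooks_id() is referenced above but not shown; the guard at the top of get_details() already tells us the detail URLs contain '/ebook/'. A minimal sketch under that assumption:

import re

def parse_beam_ebooks_id(url):
    # Hypothetical sketch: '.../ebook/12345' -> '12345'; None when the
    # URL carries no numeric ebook id.
    match = re.search(r'/ebook/(\d+)', url)
    return match.group(1) if match else None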
def load_details(self, url, timeout): def _format_item(str): return re.sub('^"(.*)"$', '\\1', unescape(str)) def _format_list(str): return [_.strip() for _ in _format_item(str).split(',')] def _find_meta(node, property): return [_.get('content') for _ in node if _.get('property') == property][0] def _format_date(date_text): year = int(date_text[0:4]) month = int(date_text[4:6]) day = int(date_text[6:]) return datetime.datetime(year, month, day, tzinfo=utc_tz) try: response = self.browser.open(url, timeout=timeout) root = lxml.html.fromstring(response.read()) # items read from the <meta> tags: # book id, title, ISBN, cover image URL, rating meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]') # items read from the schema.org JSON: # title, authors, description, publisher ld_json = root.xpath('//script[@type="application/ld+json"]/text()') ld = [json.loads(_) for _ in ld_json] book_info = [_ for _ in ld if _['@type'] == 'Book'][0] except Exception as e: self.log.exception(e) return ridibooks_id = re.search('id=([0-9]+)', url).group(1) isbn = _find_meta(meta, 'books:isbn') cover_url = _find_meta(meta, 'og:image') title = _find_meta(meta, 'og:title') authors = _format_list(book_info['author']['name']) if 'translator' in book_info: authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])]) mi = Metadata(title, authors) mi.set_identifier('ridibooks', ridibooks_id) mi.cover_url = cover_url mi.has_cover = bool(cover_url) mi.publisher = _format_item(book_info['publisher']['name']) mi.pubdate = _format_date(book_info['datePublished']) mi.comments = _format_item(book_info['description']) mi.rating = float(_find_meta(meta, 'books:rating:normalized_value')) series = re.search(u'(.*)\s*(\d+)권', title) if series: mi.series = series.group(1) mi.series_index = float(series.group(2)) mi.language = 'Korean' mi.source_relevance = self.relevance if ridibooks_id: if isbn: self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id) if cover_url: self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
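# The JSON-LD lookup is the interesting part of load_details() and works outside calibre as well. A self-contained illustration of the same idea (the sample HTML is made up):

import json
import lxml.html

def first_book_ld(html_text):
    # Return the first schema.org object with @type 'Book' embedded in the page.
    root = lxml.html.fromstring(html_text)
    for blob in root.xpath('//script[@type="application/ld+json"]/text()'):
        data = json.loads(blob)
        if isinstance(data, dict) and data.get('@type') == 'Book':
            return data
    return None

sample = '<html><head><script type="application/ld+json">{"@type": "Book", "name": "Example"}</script></head><body></body></html>'
assert first_book_ld(sample)['name'] == 'Example'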
def merge(self, results, min_year, do_asr=True): ans = Metadata(_('Unknown')) # We assume the shortest title has the least cruft in it ans.title = self.length_merge('title', results, null_value=ans.title) # No harm in having extra authors, maybe something useful like an # editor or translator ans.authors = self.length_merge('authors', results, null_value=ans.authors, shortest=False) # We assume the shortest publisher has the least cruft in it ans.publisher = self.length_merge('publisher', results, null_value=ans.publisher) # We assume the smallest set of tags has the least cruft in it ans.tags = self.length_merge('tags', results, null_value=ans.tags, shortest=msprefs['fewer_tags']) # We assume the longest series has the most info in it ans.series = self.length_merge('series', results, null_value=ans.series, shortest=False) for r in results: if r.series and r.series == ans.series: ans.series_index = r.series_index break # Average the rating over all sources ratings = [] for r in results: rating = r.rating if rating and rating > 0 and rating <= 5: ratings.append(rating) if ratings: ans.rating = int(round(sum(ratings)/len(ratings))) # Smallest language is likely to be valid ans.language = self.length_merge('language', results, null_value=ans.language) # Choose longest comments ans.comments = self.length_merge('comments', results, null_value=ans.comments, shortest=False) # Published date if min_year: for r in results: year = getattr(r.pubdate, 'year', None) if year == min_year: ans.pubdate = r.pubdate break if getattr(ans.pubdate, 'year', None) == min_year: min_date = datetime(min_year, ans.pubdate.month, ans.pubdate.day, tzinfo=utc_tz) else: min_date = datetime(min_year, 1, 2, tzinfo=utc_tz) ans.pubdate = min_date else: min_date = datetime(3001, 1, 1, tzinfo=utc_tz) for r in results: if r.pubdate is not None: candidate = as_utc(r.pubdate) if candidate < min_date: min_date = candidate if min_date.year < 3000: ans.pubdate = min_date # Identifiers for r in results: ans.identifiers.update(r.identifiers) # Cover URL ans.has_cached_cover_url = bool([r for r in results if getattr(r, 'has_cached_cover_url', False)]) # Merge any other fields with no special handling (random merge) touched_fields = set() for r in results: if hasattr(r, 'identify_plugin'): touched_fields |= r.identify_plugin.touched_fields for f in touched_fields: if f.startswith('identifier:') or not ans.is_null(f): continue setattr(ans, f, self.random_merge(f, results, null_value=getattr(ans, f))) if do_asr: avg = [x.relevance_in_source for x in results] avg = sum(avg)/len(avg) ans.average_source_relevance = avg return ans
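# length_merge() is the helper that picks one candidate value by its length; the call sites above show its contract (field name, results list, null_value fallback, shortest flag). A sketch consistent with those call sites -- my guess at the logic, not the actual implementation:

def length_merge(field, results, null_value=None, shortest=True):
    # Hypothetical sketch: gather the non-empty candidates for this field
    # and keep the shortest (least cruft) or longest (most information) one.
    values = [getattr(r, field) for r in results if not r.is_null(field)]
    if not values:
        return null_value
    return min(values, key=len) if shortest else max(values, key=len)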
def set_mi(self, mi, fm): ''' This sets the metadata for the test result books table. It doesn't reset the contents of the field selectors for editing rules. ''' self.fm = fm if mi: if not isinstance(mi, list): mi = (mi, ) else: mi = Metadata(_('Title'), [_('Author')]) mi.author_sort = _('Author Sort') mi.series = ngettext('Series', 'Series', 1) mi.series_index = 3 mi.rating = 4.0 mi.tags = [_('Tag 1'), _('Tag 2')] mi.languages = ['eng'] mi.id = 1 if self.fm is not None: mi.set_all_user_metadata(self.fm.custom_field_metadata()) else: # No field metadata. Grab a copy from the current library so # that we can validate any custom column names. The values for # the columns will all be empty, which in some very unusual # cases might cause formatter errors. We can live with that. from calibre.gui2.ui import get_gui fm = get_gui().current_db.new_api.field_metadata mi.set_all_user_metadata(fm.custom_field_metadata()) for col in mi.get_all_user_metadata(False): if fm[col]['datatype'] == 'datetime': mi.set(col, DEFAULT_DATE) elif fm[col]['datatype'] in ('int', 'float', 'rating'): mi.set(col, 2) elif fm[col]['datatype'] == 'bool': mi.set(col, False) elif fm[col]['is_multiple']: mi.set(col, (col, )) else: mi.set(col, col, 1) mi = (mi, ) self.mi = mi tv = self.template_value tv.setColumnCount(2) tv.setHorizontalHeaderLabels((_('Book title'), _('Template value'))) tv.horizontalHeader().setStretchLastSection(True) tv.horizontalHeader().sectionResized.connect(self.table_column_resized) tv.setRowCount(len(mi)) # Set the height of the table h = tv.rowHeight(0) * min(len(mi), 5) h += 2 * tv.frameWidth() + tv.horizontalHeader().height() tv.setMinimumHeight(h) tv.setMaximumHeight(h) # Set the size of the title column if self.table_column_widths: tv.setColumnWidth(0, self.table_column_widths[0]) else: tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10) tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows) tv.setRowCount(len(mi)) # Use our own widget to get rid of elision. setTextElideMode() doesn't work for r in range(0, len(mi)): w = QLineEdit(tv) w.setReadOnly(True) tv.setCellWidget(r, 0, w) w = QLineEdit(tv) w.setReadOnly(True) tv.setCellWidget(r, 1, w) self.display_values('')
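# The datatype branch above is essentially a lookup from column datatype to a safe placeholder value. The same branch as a small standalone function -- a hypothetical rewrite, with default_date standing in for the dialog's DEFAULT_DATE:

def placeholder_for_column(fm, col, default_date):
    # Hypothetical sketch: pick a harmless sample value so that template
    # evaluation cannot fail on an empty custom column.
    datatype = fm[col]['datatype']
    if datatype == 'datetime':
        return default_date
    if datatype in ('int', 'float', 'rating'):
        return 2
    if datatype == 'bool':
        return False
    if fm[col]['is_multiple']:
        return (col,)
    return col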
def parse_book_page(self, url): # TODO: Support for login-based rating fetching # TODO: Move all parsing logic to methods in order to avoid dangling variables # TODO: Saving metadata in custom columns # TODO: Configurable embedding metadata in comment # TODO: missing items # original language, first polish publish date, publisher serie, form self.log.info('INFO: Downloading book page: {}'.format(url)) root_tag = self.get_lxml_root(url) if not root_tag: return None book_tag = self.get_book_tag(root_tag) if self.prefs['title']: book_title = self.parse_title(root_tag, book_tag, url) else: book_title = self.title if self.prefs['authors']: book_authors = self.parse_authors(root_tag, book_tag, url) else: book_authors = self.authors mi = Metadata(book_title, book_authors) additional_meta = {} if self.enabled('languages'): languages = self.parse_languages(root_tag, book_tag, url) if languages: mi.languages = languages if self.enabled('rating'): rating = self.parse_rating(root_tag, book_tag, url) if rating != None: mi.rating = rating if self.enabled('tags'): tags = self.parse_tags(root_tag, book_tag, url) if tags: mi.tags = tags if self.enabled('identifier'): identifier = self.parse_identifier(root_tag, book_tag, url) if identifier: mi.set_identifier(IDENTIFIER, identifier) if self.enabled('pubdate'): pubdate = self.parse_pubdate(root_tag, book_tag, url) if pubdate: mi.pubdate = pubdate if self.enabled('covers'): covers = self.parse_covers(root_tag, book_tag, url) if covers: mi.has_cover = True self.plugin.cached_identifier_to_cover_url('urls').extend( covers) else: self.plugin.cache_identifier_to_cover_url('nocover', True) # TODO: is this necessary? if self.enabled('series'): series = self.parse_series(root_tag, book_tag, url) if series: additional_meta['series'] = [ self.get_series_string(name, index) for name, index in series ] name, index = series[0] mi.series = name if index is not None: mi.series_index = index if self.enabled('translators'): translators = self.parse_translators(root_tag, book_tag, url) if translators: additional_meta['translators'] = translators if self.enabled('original_title'): original_title = self.parse_original_title(root_tag, book_tag, url) if original_title: additional_meta['original_title'] = original_title if self.enabled('categories'): categories = self.parse_categories(root_tag, book_tag, url) if categories: additional_meta['categories'] = categories if self.enabled('genres'): genres = self.parse_genres(root_tag, book_tag, url) if genres: additional_meta['genres'] = genres if self.enabled('comments'): comments = self.parse_comments(root_tag, book_tag, url) or '' additional_comments = self.format_additional_comment( additional_meta) if comments or additional_comments: mi.comments = comments + additional_comments self.log.info('INFO: Parsing book page completed') return mi
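# format_additional_comment() folds the leftover fields (translators, original title, categories, genres) into the comment HTML. A minimal sketch of that idea; the labels and markup are mine, not the plugin's:

def format_additional_comment(additional_meta):
    # Hypothetical sketch: render {'translators': ['A', 'B'], ...}
    # as a trailing HTML block appended to the scraped description.
    if not additional_meta:
        return ''
    rows = []
    for key, value in additional_meta.items():
        if isinstance(value, list):
            value = ', '.join(value)
        rows.append('<p>%s: %s</p>' % (key.replace('_', ' ').title(), value))
    return '<div>' + ''.join(rows) + '</div>'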
def __init__(self, parent, text, mi=None, fm=None, color_field=None, icon_field_key=None, icon_rule_kind=None, doing_emblem=False, text_is_placeholder=False, dialog_is_st_editor=False, global_vars=None, all_functions=None, builtin_functions=None): QDialog.__init__(self, parent) Ui_TemplateDialog.__init__(self) self.setupUi(self) self.coloring = color_field is not None self.iconing = icon_field_key is not None self.embleming = doing_emblem self.dialog_is_st_editor = dialog_is_st_editor if global_vars is None: self.global_vars = {} else: self.global_vars = global_vars cols = [] if fm is not None: for key in sorted( displayable_columns(fm), key=lambda k: sort_key(fm[k]['name'] if k != color_row_key else 0)): if key == color_row_key and not self.coloring: continue from calibre.gui2.preferences.coloring import all_columns_string name = all_columns_string if key == color_row_key else fm[key][ 'name'] if name: cols.append((name, key)) self.color_layout.setVisible(False) self.icon_layout.setVisible(False) if self.coloring: self.color_layout.setVisible(True) for n1, k1 in cols: self.colored_field.addItem( n1 + (' (' + k1 + ')' if k1 != color_row_key else ''), k1) self.colored_field.setCurrentIndex( self.colored_field.findData(color_field)) elif self.iconing or self.embleming: self.icon_layout.setVisible(True) if self.embleming: self.icon_kind_label.setVisible(False) self.icon_kind.setVisible(False) self.icon_chooser_label.setVisible(False) self.icon_field.setVisible(False) for n1, k1 in cols: self.icon_field.addItem('{} ({})'.format(n1, k1), k1) self.icon_file_names = [] d = os.path.join(config_dir, 'cc_icons') if os.path.exists(d): for icon_file in os.listdir(d): icon_file = icu_lower(icon_file) if os.path.exists(os.path.join(d, icon_file)): if icon_file.endswith('.png'): self.icon_file_names.append(icon_file) self.icon_file_names.sort(key=sort_key) self.update_filename_box() if self.iconing: dex = 0 from calibre.gui2.preferences.coloring import icon_rule_kinds for i, tup in enumerate(icon_rule_kinds): txt, val = tup self.icon_kind.addItem(txt, userData=(val)) if val == icon_rule_kind: dex = i self.icon_kind.setCurrentIndex(dex) self.icon_field.setCurrentIndex( self.icon_field.findData(icon_field_key)) if dialog_is_st_editor: self.buttonBox.setVisible(False) else: self.new_doc_label.setVisible(False) self.new_doc.setVisible(False) self.template_name_label.setVisible(False) self.template_name.setVisible(False) if mi: if not isinstance(mi, list): mi = (mi, ) else: mi = Metadata(_('Title'), [_('Author')]) mi.author_sort = _('Author Sort') mi.series = ngettext('Series', 'Series', 1) mi.series_index = 3 mi.rating = 4.0 mi.tags = [_('Tag 1'), _('Tag 2')] mi.languages = ['eng'] mi.id = 1 if fm is not None: mi.set_all_user_metadata(fm.custom_field_metadata()) else: # No field metadata. Grab a copy from the current library so # that we can validate any custom column names. The values for # the columns will all be empty, which in some very unusual # cases might cause formatter errors. We can live with that. 
from calibre.gui2.ui import get_gui mi.set_all_user_metadata(get_gui( ).current_db.new_api.field_metadata.custom_field_metadata()) for col in mi.get_all_user_metadata(False): mi.set(col, (col, ), 0) mi = (mi, ) self.mi = mi # Set up the display table self.table_column_widths = None try: self.table_column_widths = \ gprefs.get('template_editor_table_widths', None) except: pass tv = self.template_value tv.setRowCount(len(mi)) tv.setColumnCount(2) tv.setHorizontalHeaderLabels((_('Book title'), _('Template value'))) tv.horizontalHeader().setStretchLastSection(True) tv.horizontalHeader().sectionResized.connect(self.table_column_resized) # Set the height of the table h = tv.rowHeight(0) * min(len(mi), 5) h += 2 * tv.frameWidth() + tv.horizontalHeader().height() tv.setMinimumHeight(h) tv.setMaximumHeight(h) # Set the size of the title column if self.table_column_widths: tv.setColumnWidth(0, self.table_column_widths[0]) else: tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10) # Use our own widget to get rid of elision. setTextElideMode() doesn't work for r in range(0, len(mi)): w = QLineEdit(tv) w.setReadOnly(True) tv.setCellWidget(r, 0, w) w = QLineEdit(tv) w.setReadOnly(True) tv.setCellWidget(r, 1, w) tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows) # Remove help icon on title bar icon = self.windowIcon() self.setWindowFlags(self.windowFlags() & (~Qt.WindowType.WindowContextHelpButtonHint)) self.setWindowIcon(icon) self.all_functions = all_functions if all_functions else formatter_functions( ).get_functions() self.builtins = (builtin_functions if builtin_functions else formatter_functions().get_builtins_and_aliases()) self.last_text = '' self.highlighter = TemplateHighlighter(self.textbox.document(), builtin_functions=self.builtins) self.textbox.cursorPositionChanged.connect(self.text_cursor_changed) self.textbox.textChanged.connect(self.textbox_changed) self.textbox.setFont(self.get_current_font()) self.textbox.setTabStopWidth(10) self.source_code.setTabStopWidth(10) self.documentation.setReadOnly(True) self.source_code.setReadOnly(True) if text is not None: if text_is_placeholder: self.textbox.setPlaceholderText(text) self.textbox.clear() text = '' else: self.textbox.setPlainText(text) else: text = '' self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText( _('&OK')) self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText( _('&Cancel')) self.color_copy_button.clicked.connect(self.color_to_clipboard) self.filename_button.clicked.connect(self.filename_button_clicked) self.icon_copy_button.clicked.connect(self.icon_to_clipboard) try: with open(P('template-functions.json'), 'rb') as f: self.builtin_source_dict = json.load(f, encoding='utf-8') except: self.builtin_source_dict = {} func_names = sorted(self.all_functions) self.function.clear() self.function.addItem('') for f in func_names: self.function.addItem( '{} -- {}'.format( f, self.function_type_string(f, longform=False)), f) self.function.setCurrentIndex(0) self.function.currentIndexChanged.connect(self.function_changed) self.display_values(text) self.rule = (None, '') tt = _('Template language tutorial') self.template_tutorial.setText( '<a href="%s">%s</a>' % (localize_user_manual_link( 'https://manual.calibre-ebook.com/template_lang.html'), tt)) tt = _('Template function reference') self.template_func_reference.setText( '<a href="%s">%s</a>' % (localize_user_manual_link( 'https://manual.calibre-ebook.com/generated/en/template_ref.html' ), tt)) s = 
gprefs.get('template_editor_break_on_print', False) self.go_button.setEnabled(s) self.remove_all_button.setEnabled(s) self.set_all_button.setEnabled(s) self.toggle_button.setEnabled(s) self.breakpoint_line_box.setEnabled(s) self.breakpoint_line_box_label.setEnabled(s) self.break_box.setChecked(s) self.break_box.stateChanged.connect(self.break_box_changed) self.go_button.clicked.connect(self.go_button_pressed) self.textbox.setFocus() self.set_up_font_boxes() self.toggle_button.clicked.connect(self.toggle_button_pressed) self.remove_all_button.clicked.connect(self.remove_all_button_pressed) self.set_all_button.clicked.connect(self.set_all_button_pressed) self.load_button.clicked.connect(self.load_template) self.save_button.clicked.connect(self.save_template) # Now geometry try: geom = gprefs.get('template_editor_dialog_geometry', None) if geom is not None: QApplication.instance().safe_restore_geometry( self, QByteArray(geom)) except Exception: pass
def get_metadata_(src, encoding=None): # Meta data definitions as in # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9 if isbytestring(src): if not encoding: src = xml_to_unicode(src)[0] else: src = src.decode(encoding, 'replace') src = src[:150000] # Searching shouldn't take too long comment_tags = parse_comment_tags(src) meta_tags = parse_meta_tags(src) def get(field): ans = comment_tags.get(field, meta_tags.get(field, None)) if ans: ans = ans.strip() if not ans: ans = None return ans # Title title = get('title') if not title: pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE) match = pat.search(src) if match: title = replace_entities(match.group(1)) # Author authors = get('authors') or _('Unknown') # Create MetaInformation with Title and Author mi = Metadata(title or _('Unknown'), string_to_authors(authors)) for field in ('publisher', 'isbn', 'language', 'comments'): val = get(field) if val: setattr(mi, field, val) for field in ('pubdate', 'timestamp'): try: val = parse_date(get(field)) except: pass else: if not is_date_undefined(val): setattr(mi, field, val) # SERIES series = get('series') if series: pat = re.compile(r'\[([.0-9]+)\]$') match = pat.search(series) series_index = None if match is not None: try: series_index = float(match.group(1)) except: pass series = series.replace(match.group(), '').strip() mi.series = series if series_index is None: series_index = get('series_index') try: series_index = float(series_index) except: pass if series_index is not None: mi.series_index = series_index # RATING rating = get('rating') if rating: try: mi.rating = float(rating) if mi.rating < 0: mi.rating = 0 if mi.rating > 5: mi.rating /= 2. if mi.rating > 5: mi.rating = 0 except: pass # TAGS tags = get('tags') if tags: tags = [x.strip() for x in tags.split(',') if x.strip()] if tags: mi.tags = tags return mi
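# The '[index]' suffix handling for the series meta tag above is easy to get wrong; the same semantics as a standalone helper, with a usage check:

import re

def split_series(series_field):
    # Hypothetical sketch: 'Foo [1.5]' -> ('Foo', 1.5); 'Foo' -> ('Foo', None)
    match = re.search(r'\[([.0-9]+)\]$', series_field)
    if not match:
        return series_field.strip(), None
    try:
        index = float(match.group(1))
    except ValueError:
        index = None
    return series_field[:match.start()].strip(), index

assert split_series('Foo [1.5]') == ('Foo', 1.5)
assert split_series('Foo') == ('Foo', None)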
def extract_vol_details(self, vol_url): # Here we extract and format the information from the chosen volume. # - The first name and last name to populate author and author sort : vol_auteur_prenom and vol_auteur_nom # - The title of the volume : vol_title # - The series name the volume is part of : vol_serie # - The sequence number in the series : vol_serie_seq # missing # - The editor of this volume : vol_editor # - The editor's collection of this volume : vol_coll # - The collection serial code of this volume : vol_coll_srl # - The "dépôt légal" date (the publication date is mostly unknown) : vol_dp_lgl # date format to be computed # - The ISBN number associated with the volume : vol_isbn # - The volume tags : vol_genre # - The url pointer to the volume cover image : vol_cover_index # - The comments include various info about the book : vol_comment_soup # . reference, a url pointer to noosfere # . couverture, a url pointer to noosfere; the cover may be really small, but it is accurate to the volume # . first edition information # . series (cycle) name and number # . this volume's editor info # . Résumé (quatrième de couverture) # . Critiques # . Sommaire detailing what novels are in the volume when it is an anthology # . Critiques about the series and/or about another volume of the book # debug = self.dbg_lvl & 2 self.log.info(self.who, "\nIn extract_vol_details(soup)") if debug: self.log.info(self.who, "vol_url : ", vol_url) if debug: self.log.info( self.who, "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')" ) self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who) rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who) soup = rsp[0] url_vrai = rsp[1].replace("&Tri=3", "") # if debug: self.log.info(self.who,soup.prettify()) # useful but too big... self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace( '?', '&').replace('=', '&').split('&')[2] # self.nsfr_id = (self.nfsr_id).strip("$") # If I use this form, it gives this error: 'Worker' object has no attribute 'nfsr_id' ??? (no wonder: 'nfsr_id' transposes the letters of 'nsfr_id')
tmp = self.nsfr_id self.nsfr_id = tmp.strip('$') if debug: self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id, type(self.nsfr_id)) tmp_lst = [] vol_info = {} vol_title = "" vol_auteur = "" vol_auteur_prenom = "" vol_auteur_nom = "" vol_serie = "" vol_serie_seq = "" vol_editor = "" vol_coll = "" vol_coll_srl = "" vol_dp_lgl = "" vol_isbn = "" vol_genre = "" vol_cover_index = "" comment_generic = None comment_resume = None comment_Critiques = None comment_Sommaire = None comment_AutresCritique = None comment_cover = None comment_decoupage_annexe = None # add volume address as a reference in the comment vol_comment_soup = BS( '<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai + '</a></p></div>', "lxml") if debug: self.log.info(self.who, "vol reference processed") if soup.select("span[class='TitreNiourf']"): vol_title = soup.select( "span[class='TitreNiourf']")[0].text.strip() if debug: self.log.info(self.who, "vol_title processed : ", vol_title) if soup.select("span[class='AuteurNiourf']"): vol_auteur = soup.select( "span[class='AuteurNiourf']")[0].text.replace("\n", "").strip() if debug: self.log.info(self.who, "vol_auteur processed : ", vol_auteur) for i in range(len(vol_auteur.split())): if not vol_auteur.split()[i].isupper(): vol_auteur_prenom += " " + vol_auteur.split()[i] else: vol_auteur_nom += " " + vol_auteur.split()[i].title() vol_auteur = vol_auteur.title() vol_auteur_prenom = vol_auteur_prenom.strip() if debug: self.log.info(self.who, "vol_auteur_prenom processed : ", vol_auteur_prenom) vol_auteur_nom = vol_auteur_nom.strip() if debug: self.log.info(self.who, "vol_auteur_nom processed : ", vol_auteur_nom) if soup.select("a[href*='serie.asp']"): if soup.select("a[href*='serie.asp']")[0].find_parent( "span", {"class": "ficheNiourf"}): vol_serie = soup.select("a[href*='serie.asp']")[0].text tmp_vss = [ x for x in soup.select("a[href*='serie.asp']") [0].parent.stripped_strings ] for i in range(len(tmp_vss)): if "vol." 
in tmp_vss[i]: if not vol_serie_seq: vol_serie_seq = tmp_vss[i].replace("vol.", "").strip() if "découpage" in tmp_vss[i]: dec_anx_url = "https://www.noosfere.org/livres/" + soup.select( "a[href*='serie.asp']")[0]['href'] comment_pre_decoupage_annexe = BS( '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>', "lxml") comment_decoupage_annexe = self.get_decoupage_annexe( dec_anx_url) if debug: self.log.info(self.who, "vol_serie, vol_serie_seq processed : ", vol_serie, ",", vol_serie_seq) comment_generic = soup.select("span[class='ficheNiourf']")[0] new_div = soup.new_tag('div') comment_generic = comment_generic.wrap(new_div) if debug: self.log.info(self.who, "comment_generic processed") if soup.select("a[href*='editeur.asp']"): vol_editor = soup.select("a[href*='editeur.asp']")[0].text if debug: self.log.info(self.who, "vol_editor processed : ", vol_editor) if soup.select("a[href*='collection.asp']"): vol_coll = soup.select("a[href*='collection.asp']")[0].text if debug: self.log.info(self.who, "vol_coll : ", vol_coll) for i in comment_generic.stripped_strings: tmp_lst.append(str(i)) vol_coll_srl = tmp_lst[len(tmp_lst) - 1] if "n°" in vol_coll_srl: for k in ["n°", "(", ")"]: if k in vol_coll_srl: vol_coll_srl = vol_coll_srl.replace(k, "") vol_coll_srl = vol_coll_srl.strip() vol_coll_srl = vol_coll_srl.split("/")[0] if vol_coll_srl[0].isnumeric(): vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:] else: vol_coll_srl = "" if debug: self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl) # publication date is largely ignored in noosfere, but we have the "dépot legal" date and I use it instead # note that I 'calculate' the missing day of the month and even sometimes the missing month ms = ("janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre") for elemnt in soup.select_one( "span[class='sousFicheNiourf']").stripped_strings: if debug: self.log.info(self.who, "elemnt : ", elemnt) if not vol_dp_lgl: elemn = (elemnt.replace("Dépôt légal :", "").split(','))[0].strip() if elemn: if elemn.isnumeric() and len(elemn) == 4: vol_dp_lgl = datetime.datetime.strptime( "175 " + elemn, "%j %Y") elif "semestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:] + " " + ele[2], "%j %Y") elif "trimestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:] + " " + ele[2], "%j %Y") else: for i in range(len(ms)): if ms[i] in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str(10 + 31 * i))[-3:] + " " + ele[1], "%j %Y") break if debug: self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl) if "ISBN" in elemnt: vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '') if "néant" in vol_isbn: vol_isbn = "" if debug: self.log.info(self.who, "vol_isbn processed : ", vol_isbn) if "Genre" in elemnt: vol_genre = elemnt.lstrip("Genre : ") if debug: self.log.info(self.who, "vol_genre processed : ", vol_genre) if soup.select("img[name='couverture']"): for elemnt in repr( soup.select("img[name='couverture']")[0]).split('"'): if "http" in elemnt: if not vol_cover_index: vol_cover_index = elemnt if debug: self.log.info(self.who, "vol_cover_index processed : ", vol_cover_index) # add cover image address as a reference in the comment if vol_cover_index: comment_cover = BS( '<div><p>Couverture: <a href="' + vol_cover_index + '">' + 
vol_cover_index + '</a></p></div>', "lxml") # select the fields I want... More exist, such as film adaptations or reading recommendations, # but that is not quite consistent across all the books (noosfere is a database shared by many people) # and besides I have enough info like that AND I do NOT want to take away noosfere's business tmp_comm_lst = soup.select("span[class='AuteurNiourf']") if debug: self.log.info(self.who, tmp_comm_lst) # useful but too long for i in range(len(tmp_comm_lst)): if "Quatrième de couverture" in str(tmp_comm_lst[i]): comment_resume = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_resume processed") if "Critiques" in str(tmp_comm_lst[i]): if not "autres" in str(tmp_comm_lst[i]): comment_Critiques = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Critiques processed") if "Sommaire" in str(tmp_comm_lst[i]): comment_Sommaire = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Sommaire processed") if "Critiques des autres" in str(tmp_comm_lst[i]): comment_AutresCritique = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if comment_AutresCritique.select('a[href*="serie.asp"]') and ( "Critique de la série" in comment_AutresCritique. select('a[href*="serie.asp"]')[0].text): critic_url = "https://www.noosfere.org/livres/" + comment_AutresCritique.select( 'a[href*="serie.asp"]')[0]['href'] try: more_comment_AutresCritique = self.get_Critique_de_la_serie( critic_url) comment_AutresCritique.append( more_comment_AutresCritique) except: self.log.exception( "get_Critique_de_la_serie failed for url: ", critic_url) if debug: self.log.info(self.who, "comment_AutresCritique processed") # group all the fields I think I want into one big bundle... (It is difficult not to include more... :-)) if comment_cover: vol_comment_soup.append(comment_cover) if comment_generic: vol_comment_soup.append(comment_generic) if comment_resume: vol_comment_soup.append(comment_resume) if comment_Critiques: vol_comment_soup.append(comment_Critiques) if comment_Sommaire: vol_comment_soup.append(comment_Sommaire) if comment_AutresCritique: vol_comment_soup.append(comment_AutresCritique) if comment_decoupage_annexe: vol_comment_soup.append( comment_pre_decoupage_annexe) # this is the title vol_comment_soup.append(comment_decoupage_annexe) # # Make a minimum of "repair" over vol_comment_soup so that it displays correctly (the way I like it) in the comments and in my catalogs # - I hate justify when it makes the margin "float" around the correct position (in fact, when spaces are used instead of absolute positioning) # - I like to have functional urls when they exist # - I like to find out the next and/or previous books in a series (the simulated arrows are links :-) ) for elemnt in vol_comment_soup.select('[align="justify"]'): del elemnt['align'] # remove all double or triple 'br' to improve presentation. # Note: tmp1 and tmp2 must contain a different value from any possible first elemnt. (yes, I am lrp and I am unique :-) ) # # yeah, well: if I modify comment_generic AFTER having integrated it into vol_comment_soup, there is only a single version in memory... # so vol_comment_soup gets modified...
        tmp1 = tmp2 = "lrp_the_unique"
        for elemnt in vol_comment_soup.findAll():
            tmp1, tmp2 = tmp2, elemnt
            if tmp1 == tmp2:
                elemnt.extract()

        for elemnt in vol_comment_soup.select('.AuteurNiourf'):
            # a tag object can only live in one place in the tree, so create a
            # fresh <br> for every heading instead of reusing a single one
            elemnt.insert(0, soup.new_tag('br'))
            elemnt["style"] = "font-weight: 600; font-size: 18px"

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who,
                                  "incomplete url before correction: ", elemnt)

        for elemnt in vol_comment_soup.select("a[href*='/livres/auteur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/livres/auteur.asp",
                    "https://www.noosfere.org/livres/auteur.asp")
        for elemnt in vol_comment_soup.select("a[href*='/livres/niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/livres/niourf.asp",
                    "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='/heberg/']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/heberg/", "https://www.noosfere.org/heberg/")
        for elemnt in vol_comment_soup.select("a[href*='./EditionsLivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "./EditionsLivre.asp",
                    "https://www.noosfere.org/livres/EditionsLivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='./niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "./niourf.asp",
                    "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='heberg']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "../../heberg", "https://www.noosfere.org/heberg")
        for elemnt in vol_comment_soup.select("a[href*='../bd']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "../bd", "https://www.noosfere.org/bd")
        for elemnt in vol_comment_soup.select("a[href*='auteur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "auteur.asp",
                    "https://www.noosfere.org/livres/auteur.asp")
        for elemnt in vol_comment_soup.select("a[href*='collection.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "collection.asp",
                    "https://www.noosfere.org/livres/collection.asp")
        for elemnt in vol_comment_soup.select("a[href*='critsign.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "critsign.asp",
                    "https://www.noosfere.org/livres/critsign.asp")
        for elemnt in vol_comment_soup.select("a[href*='EditionsLivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "EditionsLivre.asp",
                    "https://www.noosfere.org/livres/EditionsLivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='editeur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "editeur.asp",
                    "https://www.noosfere.org/livres/editeur.asp")
        for elemnt in vol_comment_soup.select("a[href*='editionslivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "editionslivre.asp",
                    "https://www.noosfere.org/livres/editionslivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "niourf.asp",
                    "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='serie.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "serie.asp",
                    "https://www.noosfere.org/livres/serie.asp")
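        # The fix-up cascade above could be driven by a single table instead; a
        # hedged sketch with the same behaviour (URL_FIXES is invented here; the
        # longer, more specific fragments must stay first so that e.g.
        # "/livres/auteur.asp" is rewritten before plain "auteur.asp"):
        #     URL_FIXES = (
        #         ("/livres/auteur.asp", "https://www.noosfere.org/livres/auteur.asp"),
        #         ("auteur.asp", "https://www.noosfere.org/livres/auteur.asp"),
        #         # ... one pair per fragment ...
        #     )
        #     for fragment, absolute in URL_FIXES:
        #         for a in vol_comment_soup.select("a[href*='%s']" % fragment):
        #             if 'http' not in a.get('href'):
        #                 a["href"] = a["href"].replace(fragment, absolute)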
"https://www.noosfere.org/livres/serie.asp") if debug: for elemnt in vol_comment_soup.select("a[href*='.asp']"): if 'http' not in elemnt.get('href'): self.log.info(self.who, "url incomplet apres correction: ", elemnt) fg, fd = "<<==", "==>>" #chr(0x21D0),chr(0x21D2) #chr(0x27f8),chr(0x27f9) for elemnt in vol_comment_soup.select("img[src*='arrow_left']"): elemnt.replace_with(fg) for elemnt in vol_comment_soup.select("img[src*='arrow_right']"): elemnt.replace_with(fd) # depending on the tick box, make a fat publisher using seperators that have a very low probability to pop up (§ and €) # only set vol_coll_srl if vol_coll exists # the idea is to use search and replace in the edit Metadata in bulk window. if self.extended_publisher: if debug: self.log.info( self.who, """flag : "Ajoute collection et son numéro d'ordre au champ èditeur" set""" ) if vol_coll: if debug: self.log.info(self.who, 'add collection') vol_editor = vol_editor + ('§') + vol_coll if vol_coll_srl: if debug: self.log.info(self.who, 'add collection number') vol_editor = vol_editor + ('€') + vol_coll_srl if vol_serie: if vol_serie_seq.isnumeric(): vol_serie_seq = float(vol_serie_seq) else: vol_serie_seq = 1.0 # UTF-8 characters may be serialized different ways, only xmlcharrefreplace produces xml compatible strings # any other non ascii character with another utf-8 byte representation will make calibre behave with the messsage: # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters # Side note: # I have no real good url structure(i once got html 3 times, div a sibling of html...), but calibre does not seems to care (nice :-) ) # # Ca m'a pris un temps fou pour trouver, par hazard, que encode('ascii','xmlcharrefreplace') aidait bien... # (enfin, quasi par hazard, j' ai essayé tout ce qui pouvait ameliorer la compatibilité avec xml... 
        vol_comment_soup = vol_comment_soup.encode('ascii', 'xmlcharrefreplace')

        self.log.info(self.who, "+++" * 25)
        self.log.info(self.who, "nsfr_id, type() : ", self.nsfr_id,
                      type(self.nsfr_id))  # must be <class 'str'>
        self.log.info(self.who, "relevance, type() : ", self.relevance,
                      type(self.relevance))  # must be <class 'float'>
        self.log.info(self.who, "vol_title, type() : ", vol_title,
                      type(vol_title))  # must be <class 'str'>
        self.log.info(self.who, "vol_auteur, type() : ", vol_auteur,
                      type(vol_auteur))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_auteur_prenom, type() : ",
                      vol_auteur_prenom,
                      type(vol_auteur_prenom))  # must be <class 'str'>
        self.log.info(self.who, "vol_auteur_nom, type() : ", vol_auteur_nom,
                      type(vol_auteur_nom))  # must be <class 'str'>
        if vol_serie:
            self.log.info(self.who, "vol_serie, type() : ", vol_serie,
                          type(vol_serie))  # must be <class 'str'>
            self.log.info(self.who, "vol_serie_seq, type() : ", vol_serie_seq,
                          type(vol_serie_seq))  # must be <class 'float'>
        self.log.info(self.who, "vol_editor, type() : ", vol_editor,
                      type(vol_editor))  # must be <class 'str'>
        self.log.info(self.who, "vol_coll, type() : ", vol_coll,
                      type(vol_coll))  # must be <class 'str'>
        self.log.info(self.who, "vol_coll_srl, type() : ", vol_coll_srl,
                      type(vol_coll_srl))  # must be <class 'str'>
        self.log.info(self.who, "vol_dp_lgl, type() : ", vol_dp_lgl,
                      type(vol_dp_lgl))  # must be <class 'datetime.datetime'> ('renderer=isoformat')
        self.log.info(self.who, "vol_isbn, type() : ", vol_isbn,
                      type(vol_isbn))  # must be <class 'str'>
        self.log.info(self.who, "vol_genre, type() : ", vol_genre,
                      type(vol_genre))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_cover_index, type() : ", vol_cover_index,
                      type(vol_cover_index))  # must be <class 'str'>
        self.log.info(self.who, "type(vol_comment_soup) : ",
                      type(vol_comment_soup))  # must be byte encoded (starts with b'blablabla...)
        # self.log.info(self.who, "vol_comment_soup :\n", vol_comment_soup)  # maybe a bit long sometimes
        # language must be <class 'str'>

        if vol_cover_index:
            self.plugin.cache_identifier_to_cover_url(self.nsfr_id,
                                                      vol_cover_index)
        if vol_isbn:
            self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id)

        mi = Metadata(vol_title, [vol_auteur])
        mi.set_identifier('nsfr_id', self.nsfr_id)
        mi.publisher = vol_editor
        mi.isbn = vol_isbn
        mi.tags = [vol_genre]
        mi.source_relevance = self.relevance
        mi.has_cover = bool(vol_cover_index)
        if vol_dp_lgl:
            mi.pubdate = vol_dp_lgl
        if vol_serie:
            mi.series = vol_serie
            mi.series_index = vol_serie_seq
        mi.language = "fra"
        mi.comments = vol_comment_soup

        if debug:
            self.log.info(self.who, "mi\n", mi, "\n")

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
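        # A minimal sketch of how the '§'/'€' "fat" publisher built above can be
        # split apart again outside the plugin (helper name invented here):
        #     def split_fat_publisher(fat):
        #         # 'Gallimard§Folio SF€000123' -> ('Gallimard', 'Folio SF', '000123')
        #         publisher, _, rest = fat.partition('§')
        #         collection, _, serial = rest.partition('€')
        #         return publisher, collection, serial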