def _metadata(self, baike):
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    if self.copy_image:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # If the serialization status (连载状态) says the book is finished (完结),
    # use the completion date as the publication date.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
def get_metadata(self, md):
    book = None
    if md.isbn:
        book = self.get_book_by_isbn(md.isbn)
    if not book:
        book = self.get_book_by_title(md.title)
    if not book:
        return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        for r in REMOVES:
            mi.author_sort = r.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    img_url = book['images']['large']
    img_fmt = img_url.split(".")[-1]
    img = StringIO(urlopen(img_url).read())
    mi.cover_data = (img_fmt, img)

    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
def _metadata(self, book):
    authors = []
    if book['author']:
        for author in book['author']:
            for r in REMOVES:
                author = r.sub("", author)
            authors.append(author)
    if not authors:
        authors = [u'佚名']

    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    mi = Metadata(book['title'])
    mi.authors = authors
    mi.author_sort = mi.authors[0]
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)
    mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
    mi.source = u'豆瓣'
    mi.cover_url = book['images']['large']

    if self.copy_image:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
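# A minimal sketch (not part of the original plugins) of the Metadata-population
# pattern the workers above share: construct Metadata(title, authors), fill in
# the optional fields, and return the object. The `info` dict layout and the
# `build_minimal_metadata` name are illustrative assumptions.
def build_minimal_metadata(info):
    import datetime
    from calibre.ebooks.metadata.book.base import Metadata

    mi = Metadata(info.get('title') or u'Unknown')
    mi.authors = info.get('authors') or [u'Unknown']
    mi.author_sort = mi.authors[0]
    mi.isbn = info.get('isbn')       # may be None; calibre tolerates that
    mi.tags = info.get('tags') or []
    mi.timestamp = datetime.datetime.now()
    return mi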
def parse(self, xml_detail):
    data = xml_detail.split('\n')[1].split("|")
    self.log(data)

    title = data[1]
    authors = [data[0]]
    comments = data[13]
    isbn = data[3]
    publisher = data[6]
    pub_date_tmp = data[34].split('-')
    pub_date = datetime.datetime(int(pub_date_tmp[0]), int(pub_date_tmp[1]),
                                 int(pub_date_tmp[2]), tzinfo=utc_tz)
    # isbn comes from a split, so it is an (possibly empty) string, never None
    if isbn:
        isbn_tmp = re.sub("-", "", isbn)
        cover = "%s/images/covers/%s.jpg" % (self.plugin.BASE_URL, isbn_tmp)
    else:
        cover = None

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.publisher = publisher
        mi.pubdate = pub_date
        mi.isbn = isbn
        mi.cover_url = cover
        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)
        return mi
    else:
        return None
def get_metadata(self, md, select):
    book = None
    if md.isbn:
        book = self.get_book_by_isbn(md.isbn)
    if not book:
        book = self.get_book_by_title(md.title, md.author_sort, select)
    if not book:
        return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        for r in REMOVES:
            mi.author_sort = r.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    img_url = book['images']['large']
    img_fmt = img_url.split(".")[-1]
    img = StringIO(urlopen(img_url).read())
    mi.cover_data = (img_fmt, img)

    #logging.error("=================\ndouban metadata:\n%s" % mi)
    return mi
def get_baike_metadata(self, title):
    from baidubaike import Page

    try:
        baike = Page(title)
    except:
        return None

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = info.get(u'首发网站', None)
    if not plat:
        plat = info.get(u'首发状态', "网络小说平台")
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.isbn = '0000000000001'
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.comments = baike.get_summary()

    # If the serialization status (连载状态) says the book is finished (完结),
    # use the completion date as the publication date.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
def parse(self, xml_detail, xml_more_info):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    serie, serie_index = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(as_unicode(title), authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)
        return mi
    else:
        self.log('Result skipped because title or authors were not found')
        return None
def parse(self, xml_detail):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: str(self.number)}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(str(self.number), cover)
        return mi
    else:
        return None
def parse(self, xml_detail):
    sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
    authors = []
    tags = []

    # Walk the MARC-like record table row by row; the first cell holds the
    # field tag, the second holds the data.
    xpath = self.XPath('//table[@id="record"]//tr')
    for row in xpath(xml_detail):
        ch = row.getchildren()
        txt = ch[0].text.strip()
        data = self.normalize(ch[1].text)
        if txt.startswith('245') and title is None:
            title = self.parse_title(data)
        if txt.startswith('246'):
            title = self.parse_title(data)
        elif txt.startswith('100') or txt.startswith('700'):
            res = self.parse_author(data)
            if res is not None:
                authors.append(res)
        elif txt == 'SYS':
            sys_ident = data.strip()
        elif txt == '020':
            isbn = self.parse_isbn(data)
        elif txt == '260':
            publisher, pub_year = self.parse_publisher(data)
        elif txt.startswith('490') and serie is None:
            serie, serie_index = self.parse_serie(data)
        elif txt == '655 7':
            tags.append(self.parse_tags(data))

    if isbn is not None and isbn != '':
        cover = self.parse_cover(isbn)

    if title is not None and len(authors) > 0 and sys_ident is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.identifiers = {self.plugin.name: sys_ident}
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(sys_ident, cover)
        return mi
    else:
        self.log('Data not found')
        return None
def parse_response(cls, response, isbn_initial, log):
    metadata_items = []
    page_soup = BeautifulSoup(response.text)
    for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):
        title = cls.find(candidate, 'b-result__name-wrap', True)
        author = [x.strip() for x in
                  cls.find(candidate, 'b-result__author', True).split(',')]
        comments = cls.find(candidate, 'b-result__desc__full', True).replace(u'Скрыть', '').strip()
        isbn = cls.find(candidate, 'b-result__isbn', True).split(':')[-1].split(',')[0].strip()

        log.info(u'Found candidate %s: %s' % (idx, title))

        publisher = None
        pubdate = None
        other_info = cls.find(candidate, 'b-result__years', True).strip()
        if other_info:
            for entry in other_info.split(';'):
                k, v = entry.split(':', 1)
                k = k.strip()
                if k == u'Год':
                    pubdate = parse_only_date('1.1.%s' % v.split(',')[0].strip())
                elif k == u'Издательство':
                    publisher = v.strip()

        metadata_item = Metadata(title, author)
        metadata_item.isbn = isbn or isbn_initial
        if comments:
            metadata_item.comments = comments
        if publisher is not None:
            metadata_item.publisher = publisher
        if pubdate is not None:
            metadata_item.pubdate = pubdate
        metadata_items.append(metadata_item)
    return metadata_items
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    # log.info('entry_ is: ', entry_)
    id_url = entry_['url']
    douban_id = entry_['id']
    title_ = entry_['title']
    subtitle = entry_['subtitle']
    authors = [x.strip() for x in entry_['author'] if x]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    mi.comments = entry_['summary']
    mi.publisher = entry_['publisher']

    # ISBN
    mi.isbn = entry_['isbn10']
    mi.all_isbns = [entry_['isbn10'], entry_['isbn13']]

    # Tags
    mi.tags = [x['name'].strip() for x in entry_['tags']]

    # pubdate
    pubdate = entry_['pubdate']
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    mi.rating = float(entry_['rating']['average']) / 2.0

    # Cover
    mi.has_douban_cover = entry_['image']
    return mi
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import UNDEFINED_DATE

    root = parse_html(raw)
    mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

    # Identifiers
    if self.basic_data['isbns']:
        mi.isbn = self.basic_data['isbns'][0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    if self.basic_data['tags']:
        mi.tags = self.basic_data['tags']
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    mi.publisher = self.basic_data['publisher']

    # Pubdate
    if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:
        mi.pubdate = self.basic_data['pubdate']

    # Rating
    if self.basic_data['rating']:
        mi.rating = self.basic_data['rating']

    # Comments
    comments = ''
    for cid in ('summary', 'contributorbio', 'quotes_reviews'):
        cid = 'desc_{}{}-content'.format(cid, self.sku)
        div = root.xpath('//*[@id="{}"]'.format(cid))
        if div:
            comments += self.render_comments(div[0])
    if comments:
        mi.comments = comments

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def _metadata(self, baike):
    from calibre.ebooks.metadata.book.base import Metadata

    info = baike.get_info()
    logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    mi.provider_key = KEY
    mi.provider_value = baike.get_id()

    if self.copy_image and mi.cover_url:
        logging.debug("fetching cover: %s", mi.cover_url)
        img = io.BytesIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # If the serialization status (连载状态) says the book is finished (完结),
    # use the completion date as the publication date.
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
def _check_proceed_with_extracted_isbns(self, payload):
    extracted_ids, _same_isbn_ids, _failed_ids = payload

    modified = set()
    db = self.gui.current_db
    for i, title, last_modified, isbn in extracted_ids:
        lm = db.metadata_last_modified(i, index_is_id=True)
        if lm > last_modified:
            title = db.title(i, index_is_id=True)
            authors = db.authors(i, index_is_id=True)
            if authors:
                authors = [x.replace('|', ',') for x in authors.split(',')]
                title += ' - ' + authors_to_string(authors)
            modified.add(title)

    if modified:
        from calibre.utils.icu import lower
        modified = sorted(modified, key=lower)
        if not question_dialog(self.gui, _('Some books changed'), '<p>' +
                _('The metadata for some books in your library has'
                  ' changed since you started the download. If you'
                  ' proceed, some of those changes may be overwritten. '
                  'Click "Show details" to see the list of changed books. '
                  'Do you want to proceed?'),
                det_msg='\n'.join(modified)):
            return

    # At this point we want to re-use code in edit_metadata to go ahead and
    # apply the changes. So we will replace the Metadata objects with some
    # empty ones with only the isbn field set so only that field gets updated
    id_map = {}
    for i, title, last_modified, isbn in extracted_ids:
        mi = Metadata(_('Unknown'))
        mi.isbn = isbn
        id_map[i] = mi

    edit_metadata_action = self.gui.iactions['Edit Metadata']
    edit_metadata_action.apply_metadata_changes(
        id_map, callback=self._mark_and_display_results)
def _metadata(self, baike):
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "網絡小說平台"
    plat = info.get(u'首發狀態', plat)
    plat = info.get(u'首發網站', plat)
    plat = plat.replace(u'首發', '')
    mi.publisher = info.get(u'連載平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    if self.copy_image:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    # If the serialization status (連載狀態) says the book is finished (完結),
    # use the completion date as the publication date.
    if u'完結' in info.get(u'連載狀態', ""):
        day = re.findall(r'\d+-\d+-\d+', info[u'連載狀態'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            pass
    return mi
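# The baike/douban snippets above mix `from cStringIO import StringIO`
# (Python 2) with `io.BytesIO` (Python 3). A small compatibility shim like
# this sketch would let the cover-fetching code run unchanged under either
# interpreter. The `fetch_cover_data` helper name is illustrative, not part
# of the original code.
try:
    from cStringIO import StringIO as BytesIO   # Python 2
    from urllib2 import urlopen
except ImportError:
    from io import BytesIO                      # Python 3
    from urllib.request import urlopen

def fetch_cover_data(cover_url):
    # Returns the (format, data) pair expected by Metadata.cover_data.
    img = BytesIO(urlopen(cover_url).read())
    img_fmt = cover_url.split(".")[-1]
    return (img_fmt, img)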
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If the URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
def to_metadata(self, log, entry_, timeout):  # {{{
    from calibre.utils.date import parse_date, utcnow

    log.info("to_metadata")
    douban_id = entry_.get("id")
    title = entry_.get("title")
    description = entry_.get("summary")
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get("publisher")
    isbn = entry_.get("isbn13")  # ISBN10 is obsolete, use ISBN13
    pubdate = entry_.get("pubdate")
    authors = entry_.get("author")
    book_tags = entry_.get("tags")
    rating = entry_.get("rating")
    cover_url = entry_.get("cover")
    series = entry_.get("series")

    if not authors:
        authors = [("Unknown")]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {"douban": douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN: the field may arrive as a single string or a list of strings
    isbns = []
    if isinstance(isbn, (type(""), bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for x in isbn:
            if check_isbn(x):
                isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    mi.tags = book_tags

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except BaseException:
            log.error("Failed to parse pubdate %r" % pubdate)

    # Ratings
    if rating:
        try:
            # mi.publisher += "#PrB.rating#" + str(rating)
            mi.rating = rating / 2.0
        except BaseException:
            log.exception("Failed to parse rating")
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If the URL contains "book-default", the book doesn't have a cover
        if u.find("book-default") == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series

    return mi
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If the URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
def parse(self, raw, desc_raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_date, utcnow
    import json

    root = parse_html(raw.decode('gb18030'))
    title = root.xpath('//*[@id="name"]/div[1]/text()')
    title = title[0].strip()
    authors = []
    for i in root.xpath('//*[@id="p-author"]/a'):
        authors.append(i.text.strip())
    mi = Metadata(title, authors)

    # Collect the "key: value" pairs from the parameter list. Using
    # encoding='unicode' keeps the keys as text so that lookups such as
    # info.get(u'出版社') work; split at most once so values may contain ':'.
    information = root.xpath('//*[@id="parameter2"]/li')
    info = dict()
    for i in information:
        tmp = etree.tostring(i, method='text', encoding='unicode').split(u':', 1)
        if len(tmp) == 2:
            info[tmp[0].strip()] = tmp[1].strip()

    # Identifiers
    mi.identifiers = self.plugin.identifiers
    mi.identifiers['jd'] = self.sku
    isbn = info.get('ISBN')
    self.log.info(isbn)
    if isbn:
        mi.isbn = isbn
        self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        mi.identifiers['isbn'] = isbn

    # Publisher
    mi.publisher = info.get(u'出版社')

    # Pubdate
    pubdate = info.get(u'出版时间')
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            self.log.error('Failed to parse pubdate %r' % pubdate)

    # Series
    mi.series = info.get(u'丛书名')

    # Cover
    img = root.xpath('//*[@id="spec-n1"]/img')
    cover = img[0].get('src')
    if cover:
        if not cover.startswith('http'):
            cover = 'https:' + cover
        self.plugin.cache_identifier_to_cover_url(self.sku, cover)
        self.log.info(cover)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    # Comments
    # The description arrives as a JSONP payload:
    # showdesc({"date":1583588455348,"content":" ... "})
    try:
        desc = json.loads(desc_raw[9:-1].decode('gb18030'))
        desc_root = parse_html(desc['content'])
        div = desc_root.xpath('//*[@id="detail-tag-id-3"]/div[2]/div/text()')
        comments = div[0]
        mi.comments = comments
    finally:
        # Return whatever was collected even if the comments block fails.
        return mi
def retrieve_bokelai_detail(self, bokelai_id, log, result_queue, timeout):
    detail_url = self.BOKELAI_DETAIL_URL % bokelai_id
    log.info(detail_url)

    try:
        br = self.browser
        _raw = br.open_novisit(detail_url, timeout=timeout)
        raw = _raw.read()
    except Exception as e:
        log.exception('Failed to load detail page: %s' % detail_url)
        return

    # The page embeds its bibliographic data as JSON-LD
    root = etree.HTML(raw)
    info_json_text = root.xpath("//script[@type='application/ld+json']")[0].text
    log.info(info_json_text)
    info_json = json.loads(info_json_text)

    title = info_json['name']
    authors = info_json['author'][0]['name'].split(",")
    publisher = info_json['publisher'][0]['name']
    isbn = info_json['workExample']['workExample']['isbn']
    pubdate = info_json['datePublished']

    comments_ele = root.xpath("(//div[@class='content'])[1]//text()")
    comments = "\n".join(comments_ele)

    tags = list()
    for ele in root.xpath("//li[contains(text(),'本書分類:')]/a"):
        log.info(ele.text)
        if "/" in ele.text:
            tags.extend(ele.text.split("/"))
        else:
            tags.append(ele.text)

    cover_url = re.search(r'https[^\?\=\&]*' + bokelai_id + r'[^\?\=\&]*',
                          info_json['image']).group(0)

    if not authors:
        authors = [_('Unknown')]

    log.info(title, authors, publisher, isbn, pubdate, comments, tags, cover_url)

    mi = Metadata(title, authors)
    mi.identifiers = {'bokelai': bokelai_id, 'isbn': isbn}
    mi.publisher = publisher
    mi.comments = comments
    mi.isbn = isbn
    mi.tags = tags

    if pubdate:
        try:
            from calibre.utils.date import parse_date, utcnow
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    if cover_url is not None:
        mi.has_bokelai_cover = cover_url
        self.cache_identifier_to_cover_url(mi.identifiers['bokelai'], mi.has_bokelai_cover)
    else:
        mi.has_bokelai_cover = None

    result_queue.put(mi)
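# Several workers above repeat the same pubdate idiom: parse with calibre's
# parse_date, assume UTC, and fall back to the 15th of the current month when
# the string only carries a year or month. A shared helper such as this sketch
# (the name `parse_pubdate_or_none` is made up) would capture it once.
def parse_pubdate_or_none(pubdate, log):
    from calibre.utils.date import parse_date, utcnow
    if not pubdate:
        return None
    try:
        default = utcnow().replace(day=15)
        return parse_date(pubdate, assume_utc=True, default=default)
    except Exception:
        log.error('Failed to parse pubdate %r' % pubdate)
        return None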
def parse_details(self, root):
    try:
        legie_id = self.parse_legie_id(self.url)
    except:
        self.log.exception('Error parsing Legie id for url: %r' % self.url)
        legie_id = None

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not legie_id:
        self.log.error('Could not find title/authors/Legie id for %r' % self.url)
        self.log.error('Legie: %r Title: %r Authors: %r' % (legie_id, title, authors))
        return

    self.legie_id = legie_id

    rating = comments = series = series_index = None
    try:
        rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)
    try:
        comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
    try:
        (series, series_index) = self.parse_series(root)
    except:
        self.log.info('Series not found.')
    try:
        tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
        tags = None

    if legie_id:
        editions = self.get_editions()
        if editions:
            num_editions = len(editions)
            self.log.info('Found %d editions' % num_editions)
            for edition in editions:
                (year, cover_url, publisher, isbn) = edition
                mi = Metadata(title, authors)
                self.legie_id = "%s#%s" % (legie_id, year)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                if cover_url:
                    mi.cover_url = self.cover_url = cover_url
                    self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                mi.publisher = publisher
                mi.isbn = isbn
                mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
        else:
            mi = Metadata(title, authors)
            mi.set_identifier('legie', self.legie_id)
            mi.source_relevance = self.relevance
            mi.rating = rating
            mi.comments = comments
            mi.series = series
            mi.series_index = series_index
            try:
                self.cover_url = self.parse_cover(root)
            except:
                self.log.exception('Error parsing cover for url: %r' % self.url)
            if tags:
                mi.tags = tags
            mi.has_cover = bool(self.cover_url)
            # Publisher, ISBN and publication year come from the per-edition
            # data, so they are not available on this code path.
            mi.language = "ces"
            self.result_queue.put(mi)

    if self.legie_id:
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.legie_id, self.cover_url)
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select

    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            if q.year != UNDEFINED_DATE:
                mi.pubdate = q
        except:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/',
                                         'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
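# The edelweiss worker above shows the cover-cache round trip used throughout
# these plugins: map ISBN to identifier, map identifier to cover URL, and let
# has_cover reflect whether a URL was cached. A condensed sketch; the
# `cache_cover` name and argument layout are illustrative assumptions.
def cache_cover(plugin, isbn, ident, cover_url):
    if isbn:
        plugin.cache_isbn_to_identifier(isbn, ident)
    if cover_url:
        plugin.cache_identifier_to_cover_url(ident, cover_url)
    # Later, a Metadata object can report cover availability like this:
    return plugin.cached_identifier_to_cover_url(ident) is not None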
def get_details(self):
    '''
    The get_details() function for stripping the website for all information
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the html code returned 404
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the title of the book
    try:
        title_node = root.xpath('//span[@itemprop="name"]')
        self.title = title_node[0].text
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//span[@class="expandAuthorName"]')
        author_strings = author_node[0].text.split(",")
        #print(author_strings)
        for name in author_strings:
            self.authors.append(name.strip())
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Get the series of the book
    try:
        series_node = root.xpath('//b[contains(text(), "Serie")]/a')
        if len(series_node) > 0:
            self.series = series_node[0].text.split(": ")[0].strip()
            self.series_index = series_node[0].text.split(": ")[-1].strip()
            # print("'%s'" % self.series)
            # print("'%s'" % self.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Ratings are not exposed on the page, so default to 0.0
    try:
        self.rating = 0.0
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        isbn_node = root.xpath('//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]')
        if len(isbn_node) > 0:
            self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        comment_node = root.xpath('//meta[@name="description"]/@content')
        self.comments = comment_node[0]
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        cover_node = root.xpath('//div[@class="bookDetailCoverCover"]/img/@src')
        self.cover_url = "https://mofibo.com" + cover_node[0]
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        publisher_node = root.xpath('//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
        if len(publisher_node) > 0:
            self.publisher = publisher_node[0].text
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported, though.
    try:
        language_node = root.xpath('//b[@class="expanderLanguage"]')
        language = language_node[0].text.strip().replace("Sprog:", "").replace(" ", "")
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the publication date
    try:
        pubdate_node = root.xpath('//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]')
        if len(pubdate_node) > 0:
            date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
            format_str = '%Y-%m-%d'  # The format
            self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Get the tags
    try:
        tags = []
        tags_node = root.xpath('//span[@itemprop="category"]')
        tags.append(tags_node[0].text.strip())
        self.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Set up the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('mofibo', self.url)

    # Set series
    if self.series:
        try:
            meta_data.series = self.series
            meta_data.series_index = self.series_index
        except:
            self.log.exception('Error loading series')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set pubdate
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')
    # Set tags
    if self.tags:
        try:
            meta_data.tags = self.tags
        except:
            self.log.exception('Error loading tags')

    # Clean and queue the metadata
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
    self.load_config()

    if authors is None:
        authors = []

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # ignore unknown authors
    ignored_authors = ["V. A.", "V.A.", "Unknown", "Unbekannt"]
    for i in ignored_authors:
        authors = [x for x in authors if x != i]

    if (isbn is None) and (idn is None) and (title is None) and (not authors):
        log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
        return None

    queries = []
    # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results
    exact_search = {}
    if idn is not None:
        exact_search['idn'] = idn
        # in case we look for an IDN only, search for the IDN and skip all the other stuff
        queries.append('num=' + idn)
    else:
        authors_v = []
        title_v = []

        # create some variants of the given authors
        if authors != []:
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=False)))  # concat all author names ("Peter Meier Luise Stark")
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=True)))  # use only the first author
            for a in authors:
                authors_v.append(a)  # use all authors, one by one

            # remove duplicates
            unique_authors_v = []
            for i in authors_v:
                if i not in unique_authors_v:
                    unique_authors_v.append(i)

        # create some variants of the given title
        if title is not None:
            title_v.append(title)  # simply use the given title
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False)))  # remove some punctuation characters
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))  # remove the subtitle (everything after " : ")
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=False)))  # remove some punctuation characters and joiners ("and", "&", ...)
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)))  # remove the subtitle (everything after " : ") and joiners ("and", "&", ...)
            # TODO: remove subtitle after " - "

            # remove duplicates
            unique_title_v = []
            for i in title_v:
                if i not in unique_title_v:
                    unique_title_v.append(i)

        # title and author
        if authors_v != [] and title_v != []:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')

            # try with the first author as title and the title (without subtitle) as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

            # try with author and title (without subtitle) in any index
            if isbn is not None:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # author but no title
        elif authors_v != [] and title_v == []:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')

            # try with the author as title
            if isbn is not None:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # title but no author
        elif authors_v == [] and title_v != []:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')

            # try with the title as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '"')

        # as a last resort only use the isbn
        if isbn is not None:
            queries.append('num=' + isbn)

    # remove duplicate queries
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None

    for query in uniqueQueries:
        # SRU does not work with "+" or "?" characters in the query, so we simply remove them
        query = re.sub('[\+\?]', '', query)

        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
        log.info(query)

        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)

        if results is None:
            continue

        log.info("Parsing records")

        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}

        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            authors = []
            author_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []
            publisher_name = None
            publisher_location = None

            ##### Field 264 #####
            # Publisher Name and Location
            fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()

            # Publishing Date
            for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns):
                match = re.search("(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 1, 12, 30, 0)
                    break

            # Log
            if publisher_name is not None:
                log.info("Extracted Publisher: %s" % publisher_name)
            if publisher_location is not None:
                log.info("Extracted Publisher Location: %s" % publisher_location)
            if pubdate is not None:
                log.info("Extracted Publication Year: %s" % pubdate)

            ##### Field 245 #####
            # Title/Series/Series_Index
            title_parts = []
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                # if a,n,p,n,p,n,p exist: series = a + n0 + " - " + p0 + n1 + " - " + p1, series_index = n2, title = p2
                # if a,n,p,n,p exist:     series = a + n0 + " - " + p0, series_index = n1, title = p1 (example: dnb-id 1008774839)
                # if a,n,p exist:         series = a, series_index = n, title = p
                # if a exists:            title = a
                # TODO: a,n,p,n (i.e. 956375146)

                code_p = []
                code_n = []
                code_a = []

                for j in i.xpath(".//marc21:subfield[@code='p']", namespaces=ns):
                    code_p.append(j.text.strip())

                for j in i.xpath(".//marc21:subfield[@code='n']", namespaces=ns):
                    match = re.search("(\d+[,\.\d+]?)", j.text.strip())
                    if match:
                        code_n.append(match.group(1))
                    else:
                        # looks like sometimes DNB does not know the series index and uses something like "[...]"
                        code_n.append("0")

                for j in i.xpath(".//marc21:subfield[@code='a']", namespaces=ns):
                    code_a.append(j.text.strip())

                if len(code_p) == 0:
                    title_parts = title_parts + code_a
                elif len(code_p) > 0 and len(code_p) == len(code_n):
                    series = " : ".join(code_a)  # I've never seen more than one code_a, but who knows...
                    for i in range(0, len(code_p) - 1):
                        series = series + " " + code_n[i] + " " + code_p[i]
                    series_index = code_n[-1]
                    title_parts.append(code_p[-1])

            # subtitle 1: Field 245
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns):
                title_parts.append(i.text.strip())
                break

            # subtitle 2
            #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
            #    title = title + " / " + i.text.strip()
            #    break

            title = " : ".join(title_parts)

            # Log
            if series_index is not None:
                log.info("Extracted Series_Index from Field 245: %s" % series_index)
            if series is not None:
                log.info("Extracted Series from Field 245: %s" % series)
                series = self.cleanUpSeries(log, series, publisher_name)
            if title is not None:
                log.info("Extracted Title: %s" % title)
                title = self.cleanUpTitle(log, title)

            # Title_Sort
            if len(title_parts) > 0:
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match('^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(filter(None, [title_sort_regex.group(1).strip(), title_sort_regex.group(3).strip(), ", " + sortword]))
                title_sort = " : ".join(title_sort_parts)

            # Log
            if title_sort is not None:
                log.info("Extracted Title_Sort: %s" % title_sort)

            ##### Field 100 and Field 700 #####
            # Authors
            for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # primary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # secondary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            if len(authors) == 0:  # if no "real" author was found, take all persons involved
                for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]

            # Log
            if len(authors) > 0:
                log.info("Extracted Authors: %s" % " & ".join(authors))
            if author_sort is not None:
                log.info("Extracted Author_Sort: %s" % author_sort)

            ##### Field 856 #####
            # Comments
            for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        # Strip the boilerplate heading "Angaben aus der Verlagsmeldung"
                        comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                        comments = sanitize_comments_html(comments)
                        break
                    except:
                        log.info("Could not download Comments from %s" % i)

            # Log
            if comments is not None:
                log.info('Comments: %s' % comments)

            # If no comments are found for this edition, look at other editions of this book (Field 776)
            # TODO: Make this configurable (default: yes)
            if comments is None:
                # get all other issues
                for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]", namespaces=ns):
                    other_idn = re.sub("^\(.*\)", "", i.text.strip())
                    subquery = 'num=' + other_idn + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'

                    log.info(subquery)

                    if self.cfg_dnb_token is None:
                        subresults = self.getSearchResultsByScraping(log, subquery, timeout)
                    else:
                        subresults = self.getSearchResults(log, subquery, timeout)

                    if subresults is None:
                        continue

                    for subrecord in subresults:
                        for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                            if i.text.startswith("http://deposit.dnb.de/"):
                                br = self.browser
                                log.info('Downloading Comments from: %s' % i.text)
                                try:
                                    comments = br.open_novisit(i.text, timeout=30).read()
                                    comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                                    comments = sanitize_comments_html(comments)
                                    break
                                except:
                                    log.info("Could not download Comments from %s" % i)

                        if comments is not None:
                            log.info('Comments from other issue: %s' % comments)
                            break

            ##### Field 016 #####
            # ID: IDN
            for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                idn = i.text.strip()
                break
            # Log
            if idn is not None:
                log.info("Extracted ID IDN: %s" % idn)

            ##### Field 024 #####
            # ID: URN
            for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                urn = i.text.strip()
                break
            # Log
            if urn is not None:
                log.info("Extracted ID URN: %s" % urn)

            ##### Field 020 #####
            # ID: ISBN
            for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                if match is not None:
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                break
            # Log
            if isbn is not None:
                log.info("Extracted ID ISBN: %s" % isbn)

            # When doing an exact search for a given ISBN, skip books with wrong ISBNs
            if isbn is not None and "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info("Extracted ISBN does not match book's ISBN, skipping record")
                    continue

            ##### Field 082 #####
            # ID: Sachgruppe (DDC)
            for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                ddc.append(i.text.strip())
            # Log
            if len(ddc) > 0:
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

            ##### Field 490 #####
            # In theory this field is not used for "real" book series; field 830 should be used instead. But it is used anyway.
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # "v" is either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available, use this instead of attribute "a"
                    attr_v = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    parts = re.split(" : ", attr_v)
                    if len(parts) == 2:
                        if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search("\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]
                            match = re.search("(\d+[,\.\d+]?)", indexpart)
                            if match is not None:
                                series_index = match.group(1)
                                series = textpart.strip()
                    else:
                        match = re.search("(\d+[,\.\d+]?)", attr_v)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"

                    if series_index is not None:
                        series_index = series_index.replace(',', '.')

                    # Use the series name from attribute "a" if it was not already found in attribute "v"
                    if series is None:
                        series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()

                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 490: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 490: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 246 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
                    if match is not None:
                        series = match.group(1)
                        series_index = match.group(2)
                        # Log
                        if series_index is not None:
                            log.info("Extracted Series Index from Field 246: %s" % series_index)
                        if series is not None:
                            log.info("Extracted Series from Field 246: %s" % series)
                            series = self.cleanUpSeries(log, series, publisher_name)
                        if series is not None:
                            break

            ##### Field 800 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='t']", namespaces=ns)[0].text.strip()
                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 800: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 800: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 830 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 830: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 830: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 689 #####
            # GND Subjects
            for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                subjects_gnd.append(i.text.strip())
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            # Log
            if len(subjects_gnd) > 0:
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            ##### Fields 600-655 #####
            # Non-GND subjects
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects (iterate over a copy, since we mutate the list):
            for i in list(subjects_non_gnd):
                if len(i) < 2:
                    subjects_non_gnd.remove(i)
            # Log
            if len(subjects_non_gnd) > 0:
                log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

            ##### Field 250 #####
            # Edition
            for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                edition = i.text.strip()
                break
            # Log
            if edition is not None:
                log.info("Extracted Edition: %s" % edition)

            ##### Field 041 #####
            # Languages
            for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                languages.append(i.text.strip())
            # Log
            if languages:
                log.info("Extracted Languages: %s" % ",".join(languages))

            ##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
            #if self.cfg_guess_series is True:
            if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                guessed_title = None

                log.info("Starting Series Guesser")

                parts = re.split("[:]", self.removeSortingCharacters(title))

                if len(parts) == 2:
                    log.info("Title has two parts")
                    # make sure only one of the two parts contains digits
                    if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                        log.info("only one title part contains digits")
                        # figure out which part contains the index
                        if bool(re.search("\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]

                        # Look at the part without digits:
                        match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)  # remove odd characters from the start and end of the text part
                        if match:
                            textpart = match.group(1)

                        # Look at the part with digits:
                        # for title parts like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                guessed_title = textpart + " : Band " + guessed_series_index
                            else:
                                guessed_title = textpart
                            #log.info("ALGO1: guessed_title: " + guessed_title)
                            #log.info("ALGO1: guessed_series: " + guessed_series)
                            #log.info("ALGO1: guessed_series_index: " + guessed_series_index)
                        else:
                            # for title parts like: "Episode 2 Name of the series"
                            match = re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$", indexpart)
                            if match:
                                guessed_series_index = match.group(1)
                                guessed_series = match.group(2)
                                if guessed_series is None:
                                    # sometimes books with multiple volumes are detected as series without a name -> add the volume to the title
                                    guessed_series = textpart
                                    guessed_title = textpart + " : Band " + guessed_series_index
                                else:
                                    guessed_title = textpart
                                #log.info("ALGO2: guessed_title: " + guessed_title)
                                #log.info("ALGO2: guessed_series: " + guessed_series)
                                #log.info("ALGO2: guessed_series_index: " + guessed_series_index)
                            else:
                                # for title parts like: "Band 2"
                                match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$", indexpart)
                                if match:
                                    guessed_series_index = match.group(1)
                                    # ...with a textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE
                                    # some false positives
                                    match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$", textpart)
                                    if match:
                                        guessed_series = match.group(1)
                                        guessed_title = match.group(2)
                                        log.info("ALGO3: guessed_title: " + guessed_title)
                                        log.info("ALGO3: guessed_series: " + guessed_series)
                                        log.info("ALGO3: guessed_series_index: " + guessed_series_index)

                elif len(parts) == 1:
                    log.info("Title has one part")
                    # for titles like: "Name of the series - Title (Episode 2)"
                    match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                    if match:
                        guessed_series_index = match.group(3)
                        guessed_series = match.group(1)
                        guessed_title = match.group(2)
                        #log.info("ALGO4: guessed_title: " + guessed_title)
                        #log.info("ALGO4: guessed_series: " + guessed_series)
                        #log.info("ALGO4: guessed_series_index: " + guessed_series_index)
                    else:
                        # for titles like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            guessed_title = guessed_series + " : Band " + guessed_series_index
                            #log.info("ALGO5: guessed_title: " + guessed_title)
                            #log.info("ALGO5: guessed_series: " + guessed_series)
                            #log.info("ALGO5: guessed_series_index: " + guessed_series_index)

                # Log
                if guessed_series is not None:
                    log.info("Guessed Series: %s" % guessed_series)
                    #guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name)
                if guessed_series_index is not None:
                    log.info("Guessed Series Index: %s" % guessed_series_index)
                if guessed_title is not None:
                    log.info("Guessed Title: %s" % guessed_title)
                    guessed_title = self.cleanUpTitle(log, guessed_title)

                if guessed_series is not None and guessed_series_index is not None and guessed_title is not None:
                    title = guessed_title
                    series = guessed_series
                    series_index = guessed_series_index

            ##### Filter exact searches #####
            # When doing an exact search for a given IDN, skip books with wrong IDNs
            # TODO: Currently exact_search for ISBN is not implemented.
Would require ISBN-10 and ISBN-13 conversions if idn is not None and "idn" in exact_search: if idn != exact_search["idn"]: log.info("Extracted IDN does not match book's IDN, skipping record") continue ##### Put it all together ##### if self.cfg_append_edition_to_title == True and edition is not None: title = title + " : " + edition mi = Metadata(self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors)) mi.title_sort = self.removeSortingCharacters(title_sort) mi.author_sort = self.removeSortingCharacters(author_sort) mi.languages = languages mi.pubdate = pubdate mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)])) mi.series = self.removeSortingCharacters(series) mi.series_index = series_index mi.comments = comments mi.isbn = isbn # also required for cover download mi.set_identifier('urn',urn) mi.set_identifier('dnb-idn',idn) mi.set_identifier('ddc', ",".join(ddc)) # cfg_subjects: # 0: use only subjects_gnd if self.cfg_fetch_subjects == 0: mi.tags = self.uniq(subjects_gnd) # 1: use only subjects_gnd if found, else subjects_non_gnd elif self.cfg_fetch_subjects == 1: if len(subjects_gnd)>0: mi.tags = self.uniq(subjects_gnd) else: mi.tags = self.uniq(subjects_non_gnd) # 2: subjects_gnd and subjects_non_gnd elif self.cfg_fetch_subjects == 2: mi.tags = self.uniq(subjects_gnd + subjects_non_gnd) # 3: use only subjects_non_gnd if found, else subjects_gnd elif self.cfg_fetch_subjects == 3: if len(subjects_non_gnd)>0: mi.tags = self.uniq(subjects_non_gnd) else: mi.tags = self.uniq(subjects_gnd) # 4: use only subjects_non_gnd elif self.cfg_fetch_subjects == 4: mi.tags = self.uniq(subjects_non_gnd) # 5: use no subjects at all elif self.cfg_fetch_subjects == 5: mi.tags = [] # put current result's metdata into result queue log.info("Final formatted result: \n%s" % mi) result_queue.put(mi)
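# The TODO above notes that exact ISBN filtering would need ISBN-10/ISBN-13
# conversion. A minimal, self-contained sketch of the standard conversion
# (the helper name isbn10_to_isbn13 is illustrative, not part of this plugin):
def isbn10_to_isbn13(isbn10):
    # Take the 9 significant digits, prefix '978', recompute the check digit
    core = '978' + isbn10.replace('-', '')[:9]
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    return core + str((10 - total % 10) % 10)
# isbn10_to_isbn13('0306406152') -> '9780306406157'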
def parse(self, raw): from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import parse_only_date, UNDEFINED_DATE from css_selectors import Select root = parse_html(raw) selector = Select(root) sku = next(selector('div.sku.attGroup')) info = sku.getparent() top = info.getparent().getparent() banner = top.find('div') spans = banner.findall('span') title = '' for i, span in enumerate(spans): if i == 0 or '12pt' in span.get('style', ''): title += astext(span) else: break authors = [ re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',') ] mi = Metadata(title.strip(), authors) # Identifiers isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')] for isbn in isbns: if isbn: self.plugin.cache_isbn_to_identifier(isbn, self.sku) isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True) if isbns and isbns[0]: mi.isbn = isbns[0] mi.set_identifier('edelweiss', self.sku) # Tags bisac = tuple(selector('div.bisac.attGroup')) if bisac: bisac = astext(bisac[0]) mi.tags = [x.strip() for x in bisac.split(',')] mi.tags = [ t[1:].strip() if t.startswith('&') else t for t in mi.tags ] # Publisher pub = tuple(selector('div.supplier.attGroup')) if pub: pub = astext(pub[0]) mi.publisher = pub # Pubdate pub = tuple(selector('div.shipDate.attGroupItem')) if pub: pub = astext(pub[0]) parts = pub.partition(':')[0::2] pub = parts[1] or parts[0] try: if ', Ship Date:' in pub: pub = pub.partition(', Ship Date:')[0] q = parse_only_date(pub, assume_utc=True) if q.year != UNDEFINED_DATE: mi.pubdate = q except: self.log.exception('Error parsing published date: %r' % pub) # Comments comm = '' general = tuple(selector('div#pd-general-overview-content')) if general: q = self.render_comments(general[0]) if q != '<p>No title summary available. </p>': comm += q general = tuple(selector('div#pd-general-contributor-content')) if general: comm += self.render_comments(general[0]) general = tuple(selector('div#pd-general-quotes-content')) if general: comm += self.render_comments(general[0]) if comm: mi.comments = comm # Cover img = tuple(selector('img.title-image[src]')) if img: href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/') self.plugin.cache_identifier_to_cover_url(self.sku, href) mi.has_cover = self.plugin.cached_identifier_to_cover_url( self.sku) is not None return mi
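# parse() above caches every valid ISBN found in the sku field but prefers the
# longest candidate (an ISBN-13 over an ISBN-10) for mi.isbn. A standalone
# sketch of that selection rule, with calibre's check_isbn stubbed to a plain
# length/digit test for illustration:
def pick_preferred_isbn(candidates):
    def check(x):
        x = x.strip().replace('-', '')
        return x if len(x) in (10, 13) and x.replace('X', '').isdigit() else None
    isbns = sorted((check(x) for x in candidates),
                   key=lambda x: len(x) if x else 0, reverse=True)
    return isbns[0] if isbns and isbns[0] else None
# pick_preferred_isbn(['0306406152', '9780306406157']) -> '9780306406157'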
def parse_details(self, root):
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for query: %r' % self.query)
        title = None
    if not title:
        self.log.error('Could not find title for %r' % self.query)
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for query: %r' % self.query)
        authors = []
    if not authors:
        self.log.error('Could not find authors for %r' % self.query)
        return
    mi = Metadata(title, authors)
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            # match 10 or 13 digits at the start, followed by a space or nothing
            p = re.compile('^([0-9]{13}|[0-9]{10})(?= |\Z)')
            if isinstance(isbn, str):
                m = p.match(isbn)
                if m:
                    mi.isbn = m.group()
            else:
                m = p.match(isbn[0])
                if m:
                    mi.isbn = m.group()
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)
    try:
        lang = self.parse_language(root)
        if lang:
            mi.languages = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)
    try:
        lccn = self.parse_lccn(root)
        if lccn:
            if isinstance(lccn, str):
                mi.set_identifier('lccn', lccn)
            else:
                for identifier in lccn:
                    mi.set_identifier('lccn', identifier)
    except:
        self.log.exception('Error parsing LCCN for url: %r' % self.url)
    try:
        ddc = self.parse_ddc(root)
        if ddc:
            if isinstance(ddc, str):
                mi.set_identifier('ddc', ddc)
            else:
                for identifier in ddc:
                    mi.set_identifier('ddc', identifier)
    except:
        self.log.exception('Error parsing DDC for url: %r' % self.url)
    try:
        lcc = self.parse_lcc(root)
        if lcc:
            if isinstance(lcc, str):
                mi.set_identifier('lcc', lcc)
            else:
                for identifier in lcc:
                    mi.set_identifier('lcc', identifier)
    except:
        self.log.exception('Error parsing LCC for url: %r' % self.url)
    mi.source_relevance = self.relevance
    self.result_queue.put(mi)
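# The ISBN regex above is easy to misread: it accepts a run of exactly 13 or
# exactly 10 digits at the start of the string, and only when followed by a
# space or the end of the string. A quick standalone check of that behaviour:
import re
_p = re.compile(r'^([0-9]{13}|[0-9]{10})(?= |\Z)')
assert _p.match('9780306406157')           # bare ISBN-13 matches
assert _p.match('0306406152 (pbk.)')       # ISBN-10 followed by a space matches
assert _p.match('030640615') is None       # 9 digits: too short
assert _p.match('97803064061570') is None  # 14 digits: lookahead fails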
def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers): from lxml import etree def tostring(x): if x is None: return '' return etree.tostring(x, method='text', encoding=unicode).strip() orig_isbn = identifiers.get('isbn', None) title_tokens = list(self.get_title_tokens(orig_title)) author_tokens = list(self.get_author_tokens(orig_authors)) results = [] def ismatch(title, authors): authors = lower(' '.join(authors)) title = lower(title) match = not title_tokens for t in title_tokens: if lower(t) in title: match = True break amatch = not author_tokens for a in author_tokens: if lower(a) in authors: amatch = True break if not author_tokens: amatch = True return match and amatch bl = feed.find('BookList') if bl is None: err = tostring(feed.find('errormessage')) raise ValueError('ISBNDb query failed:' + err) total_results = int(bl.get('total_results')) shown_results = int(bl.get('shown_results')) for bd in bl.xpath('.//BookData'): isbn = check_isbn(bd.get('isbn', None)) isbn13 = check_isbn(bd.get('isbn13', None)) if not isbn and not isbn13: continue if orig_isbn and orig_isbn not in {isbn, isbn13}: continue title = tostring(bd.find('Title')) if not title: continue authors = [] for au in bd.xpath('.//Authors/Person'): au = tostring(au) if au: if ',' in au: ln, _, fn = au.partition(',') au = fn.strip() + ' ' + ln.strip() authors.append(au) if not authors: continue comments = tostring(bd.find('Summary')) id_ = (title, tuple(authors)) if id_ in seen: continue seen.add(id_) if not ismatch(title, authors): continue publisher = tostring(bd.find('PublisherText')) if not publisher: publisher = None if publisher and 'audio' in publisher.lower(): continue mi = Metadata(title, authors) mi.isbn = isbn mi.publisher = publisher mi.comments = comments results.append(mi) return total_results, shown_results, results
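# ismatch() above is deliberately loose: a record passes if ANY title token
# occurs in the found title and ANY author token in the found authors (an
# empty token list always passes). The same rule, as a compact standalone
# sketch:
def loose_match(title_tokens, author_tokens, title, authors):
    title = title.lower()
    authors = ' '.join(authors).lower()
    tmatch = not title_tokens or any(t.lower() in title for t in title_tokens)
    amatch = not author_tokens or any(a.lower() in authors for a in author_tokens)
    return tmatch and amatch
# loose_match(['dune'], ['herbert'], 'Dune Messiah', ['Frank Herbert']) -> True
# loose_match(['dune'], ['asimov'], 'Dune Messiah', ['Frank Herbert']) -> False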
def parse(self, raw): from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import parse_only_date, UNDEFINED_DATE root = parse_html(raw) sku = CSSSelect("div.sku.attGroup")(root)[0] info = sku.getparent() top = info.getparent().getparent() banner = top.find("div") spans = banner.findall("span") title = "" for i, span in enumerate(spans): if i == 0 or "12pt" in span.get("style", ""): title += astext(span) else: break authors = [re.sub(r"\(.*\)", "", x).strip() for x in astext(spans[-1]).split(",")] mi = Metadata(title.strip(), authors) # Identifiers isbns = [check_isbn(x.strip()) for x in astext(sku).split(",")] for isbn in isbns: if isbn: self.plugin.cache_isbn_to_identifier(isbn, self.sku) isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True) if isbns and isbns[0]: mi.isbn = isbns[0] mi.set_identifier("edelweiss", self.sku) # Tags bisac = CSSSelect("div.bisac.attGroup")(root) if bisac: bisac = astext(bisac[0]) mi.tags = [x.strip() for x in bisac.split(",")] mi.tags = [t[1:].strip() if t.startswith("&") else t for t in mi.tags] # Publisher pub = CSSSelect("div.supplier.attGroup")(root) if pub: pub = astext(pub[0]) mi.publisher = pub # Pubdate pub = CSSSelect("div.shipDate.attGroupItem")(root) if pub: pub = astext(pub[0]) parts = pub.partition(":")[0::2] pub = parts[1] or parts[0] try: if ", Ship Date:" in pub: pub = pub.partition(", Ship Date:")[0] q = parse_only_date(pub, assume_utc=True) if q.year != UNDEFINED_DATE: mi.pubdate = q except: self.log.exception("Error parsing published date: %r" % pub) # Comments comm = "" general = CSSSelect("div#pd-general-overview-content")(root) if general: q = self.render_comments(general[0]) if q != "<p>No title summary available. </p>": comm += q general = CSSSelect("div#pd-general-contributor-content")(root) if general: comm += self.render_comments(general[0]) general = CSSSelect("div#pd-general-quotes-content")(root) if general: comm += self.render_comments(general[0]) if comm: mi.comments = comm # Cover img = CSSSelect("img.title-image[src]")(root) if img: href = img[0].get("src").replace("jacket_covers/medium/", "jacket_covers/flyout/") self.plugin.cache_identifier_to_cover_url(self.sku, href) mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None return mi
def _GoodreadsBook_to_Metadata(self, book): # type: (_GoodreadsBook) -> Metadata """ :param book: _GoodreadsBook: book :return: Metadata: Metadata """ mi = Metadata(book.title, book.authors) mi.source_relevance = 0 mi.set_identifier('goodreads', book.id) if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get( 'isbn'): mi.set_identifier('isbn', '') if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']: mi.set_identifier('amazon', book.asin) if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']: try: if len(book.isbn) == 10: mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn)) else: mi.isbn = check_isbn13(book.isbn) except: self.log.error("ISBN CONVERSION ERROR:", book.isbn) self.log.exception() if book.image_url: self.log.info('cache_identifier_to_cover_url:', book.asin, ':', book.image_url) self.cache_identifier_to_cover_url(book.id, book.image_url) if book.publisher: self.log.info('book.publisher is:', book.publisher) mi.publisher = book.publisher if book.pubdate: self.log.info('book.pubdate is:', book.pubdate.strftime('%Y-%m-%d')) mi.pubdate = book.pubdate if book.comments: self.log.info('book.editorial_review is:', book.comments) mi.comments = book.comments tags = self.prefs['ADD_THESE_TAGS'].split(',') tags.extend(book.tags) # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings'] # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags)))) if book.series: mi.series = book.series self.log.info(u'series:', book.series) if book.series_index: mi.series_index = book.series_index self.log.info(u'series_index:', "{0:.2f}".format(book.series_index)) else: mi.series_index = 0 if book.average_rating: mi.rating = book.average_rating self.clean_downloaded_metadata(mi) return mi
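# check_isbn13() above validates the converted identifier. The underlying rule
# is the standard weighted mod-10 check digit; a minimal standalone equivalent
# (calibre's helper returns the normalised ISBN rather than a boolean):
def is_valid_isbn13(isbn):
    digits = [int(c) for c in isbn if c.isdigit()]
    if len(digits) != 13:
        return False
    return sum((1 if i % 2 == 0 else 3) * d for i, d in enumerate(digits)) % 10 == 0
# is_valid_isbn13('978-0-306-40615-7') -> True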
def parse_details(self, root): try: isbn = self.extract_isbn(self.url) except: self.log.exception('No ISBN in URL: %r'%self.url) isbn = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not isbn: self.log.error('Could not find title/authors/Aladin id for %r'%self.url) self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index #mi.set_identifier('isbn', isbn) mi.isbn = isbn self.isbn = isbn # ISBN-13 try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! if mi.has_cover: self.log.info('Cover URL: '+mi.cover_url) try: mi.publisher = self.parse_publisher(root) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) except: self.log.exception('Error parsing published date for url: %r'%self.url) mi.language = 'ko' mi.source_relevance = self.relevance self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
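# parse_details() above, like its siblings in this file, repeats the same
# try/log/fall-back pattern for every field. A small illustrative helper that
# would collapse those blocks (a sketch, not part of the plugin):
def safe_parse(log, what, url, fn, default=None):
    # Run one field parser; on any failure, log it and return the default.
    try:
        return fn()
    except Exception:
        log.exception('Error parsing %s for url: %r' % (what, url))
        return default
# usage sketch:
#   mi.publisher = safe_parse(self.log, 'publisher', self.url,
#                             lambda: self.parse_publisher(root))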
def get_details(self):
    '''
    Scrape the details page and extract all the information we can.
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(
            self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the website returned a 404 page
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the json data within the HTML code (some stuff is easier to get with json)
    try:
        json_raw = root.xpath('(//script[@type="application/ld+json"])[2]')
        json_root = json.loads(json_raw[0].text.strip())
        #print(json.dumps(json_root, indent=4, sort_keys=True))
    except:
        self.log.error("Error loading JSON data")
        return

    # Get the title of the book
    try:
        self.title = json_root['name']
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//h2[@class="product-page-heading__autor"]//a')
        for name in author_node:
            self.authors.append(name.text.strip())
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Some books have ratings, let's use them.
    try:
        self.rating = float(json_root['aggregateRating']['ratingValue'])
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        self.isbn = json_root['isbn']
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        self.comments = parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        self.cover_url = json_root['image']
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        self.publisher = json_root['publisher']['name']
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported.
    try:
        language = json_root['inLanguage']['name']
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the publication date
    try:
        #pubdate_node = root.xpath('(//dl[@class="product-info-list"]//dd)[2]')  # Format dd-mm-yyyy
        pubdate_node = root.xpath(
            '//div[@class="product-page-block__container"]//dd'
        )  # Format dd-mm-yyyy
        date_str = pubdate_node[0].text.strip()
        format_str = '%d-%m-%Y'  # The format
        self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Setup the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('saxo', self.url)

    # Set rating
    if self.rating:
        try:
            meta_data.rating = self.rating
        except:
            self.log.exception('Error loading rating')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set publication date
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')

    # Put meta data
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
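# get_details() above reads most fields from the JSON-LD block embedded in the
# page instead of scraping individual nodes. A self-contained sketch of the
# technique (the HTML snippet is made up; Saxo's real pages embed a similar
# application/ld+json script):
import json
from lxml import html
_doc = html.fromstring(
    '<html><head><script type="application/ld+json">'
    '{"name": "Example Title", "isbn": "9780306406157",'
    ' "publisher": {"name": "Example Forlag"}}'
    '</script></head><body></body></html>')
_node = _doc.xpath('//script[@type="application/ld+json"]')[0]
_data = json.loads(_node.text)
# _data['name'] -> 'Example Title'; _data['publisher']['name'] -> 'Example Forlag'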
def to_metadata(browser, log, entry_, timeout): # {{{ from lxml import etree XPath = partial(etree.XPath, namespaces=NAMESPACES) # total_results = XPath('//openSearch:totalResults') # start_index = XPath('//openSearch:startIndex') # items_per_page = XPath('//openSearch:itemsPerPage') entry = XPath('//atom:entry') entry_id = XPath('descendant::atom:id') creator = XPath('descendant::dc:creator') identifier = XPath('descendant::dc:identifier') title = XPath('descendant::dc:title') date = XPath('descendant::dc:date') publisher = XPath('descendant::dc:publisher') subject = XPath('descendant::dc:subject') description = XPath('descendant::dc:description') language = XPath('descendant::dc:language') rating = XPath('descendant::gd:rating[@average]') def get_text(extra, x): try: ans = x(extra) if ans: ans = ans[0].text if ans and ans.strip(): return ans.strip() except: log.exception('Programming error:') return None id_url = entry_id(entry_)[0].text google_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: authors = [_('Unknown')] if not id_url or not title_: # Silently discard this entry return None mi = Metadata(title_, authors) mi.identifiers = {'google':google_id} try: raw = get_details(browser, id_url, timeout) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]) extra = entry(feed)[0] except: log.exception('Failed to get additional details for', mi.title) return mi mi.comments = get_text(extra, description) lang = canonicalize_lang(get_text(extra, language)) if lang: mi.language = lang mi.publisher = get_text(extra, publisher) # ISBN isbns = [] for x in identifier(extra): t = str(x.text).strip() if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): if t[:5].upper() == 'ISBN:': t = check_isbn(t[5:]) if t: isbns.append(t) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns # Tags try: btags = [x.text for x in subject(extra) if x.text] tags = [] for t in btags: atags = [y.strip() for y in t.split('/')] for tag in atags: if tag not in tags: tags.append(tag) except: log.exception('Failed to parse tags:') tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] # pubdate pubdate = get_text(extra, date) if pubdate: from calibre.utils.date import parse_date, utcnow try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: log.error('Failed to parse pubdate %r'%pubdate) # Ratings for x in rating(extra): try: mi.rating = float(x.get('average')) if mi.rating > 5: mi.rating /= 2 except: log.exception('Failed to parse rating') # Cover mi.has_google_cover = None for x in extra.xpath( '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'): mi.has_google_cover = x.get('href') break return mi
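# to_metadata() above parses pubdate with default=utcnow().replace(day=15):
# the feed can give partial dates like "YYYY" or "YYYY-MM", and a mid-month
# default day keeps the result from drifting into a neighbouring month or year
# when timezones are applied. A small illustration with calibre's parse_date:
from calibre.utils.date import parse_date, utcnow
_default = utcnow().replace(day=15)
_d = parse_date('2009-07', assume_utc=True, default=_default)
# _d.year == 2009, _d.month == 7; the missing day (15) comes from the default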
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): self.load_config() # get identifying tags from book idn = identifiers.get('dnb-idn', None) isbn = check_isbn(identifiers.get('isbn', None)) # ignore unknown author placeholders if authors and authors[0] in ("V. A.", "V.A.", "Unknown", "Unbekannt"): authors = None if (isbn is None) and (idn is None) and (title is None) and (authors is None): log.info( "This plugin requires at least either ISBN, IDN, Title or Author(s)." ) return None queries = [] # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results exact_search = {} if idn is not None: queries.append('num=' + idn) exact_search['idn'] = idn else: authors_v = [] title_v = [] if authors is not None: authors_v.append(' '.join(authors)) authors_v.append(' '.join( self.get_author_tokens(authors, only_first_author=False))) authors_v.append(' '.join( self.get_author_tokens(authors, only_first_author=True))) if title is not None: title_v.append(title) title_v.append(' '.join( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False))) title_v.append(' '.join( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True))) if isbn is not None: exact_search['isbn'] = isbn # title and author if authors is not None and title is not None: for a in authors_v: for t in title_v: if isbn is not None: queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"') else: queries.append('tit="' + t + '" AND per="' + a + '"') # try with author and title swapped if isbn is not None: queries.append('per="' + title + '" AND tit="' + authors[0] + '" AND num="' + isbn + '"') else: queries.append('per="' + title + '" AND tit="' + authors[0] + '"') # author(s) but no title elif authors is not None and title is None: for i in authors_v: if isbn is not None: queries.append('per="' + i + '" AND num="' + isbn + '"') else: queries.append('per="' + i + '"') # try with author and title swapped if isbn is not None: queries.append('tit="' + authors[0] + '" AND num="' + isbn + '"') else: queries.append('tit="' + authors[0] + '"') # title but no author(s) elif authors is None and title is not None: for i in title_v: if isbn is not None: queries.append('tit="' + i + '" AND num="' + isbn + '"') else: queries.append('tit="' + i + '"') # try with author and title swapped if isbn is not None: queries.append('per="' + title + '" AND num="' + isbn + '"') else: queries.append('per="' + title + '"') # as last resort only use isbn if isbn is not None: queries.append('num=' + isbn) # Sort queries descending by length (assumption: longer query -> less but better results) #queries.sort(key=len) #queries.reverse() # remove duplicate queries uniqueQueries = [] for i in queries: if i not in uniqueQueries: uniqueQueries.append(i) # Process queries results = None for query in uniqueQueries: query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)' log.info(query) if self.cfg_dnb_token is None: results = self.getSearchResultsByScraping(log, query, timeout) else: results = self.getSearchResults(log, query, timeout) if results is None: continue log.info("Parsing records") ns = {'marc21': 'http://www.loc.gov/MARC21/slim'} for record in results: series = None series_index = None publisher = None pubdate = None languages = [] title = None title_sort = None edition = None comments = None idn = None urn = None isbn = None ddc = [] subjects_gnd = [] subjects_non_gnd = [] # Title: Field 245 title_parts = [] # if
a,n,p exist: series = a, series_index = n, title = p for i in record.xpath( ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..", namespaces=ns): series_index = i.xpath(".//marc21:subfield[@code='n']", namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match: series_index = match.group(1) else: series_index = "0" # looks like sometimes DNB does not know the series index and uses something like "[...]" series_index = series_index.replace(',', '.') series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip() title_parts.append( i.xpath(".//marc21:subfield[@code='p']", namespaces=ns)[0].text.strip()) log.info("Extracted Series: %s" % series) log.info("Extracted Series Index: %s" % series_index) break # otherwise: title = a if len(title_parts) == 0: for i in record.xpath( ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): title_parts.append(i.text.strip()) break # subtitle 1 for i in record.xpath( ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns): title_parts.append(i.text.strip()) break # subtitle 2 #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns): # title = title + " / " + i.text.strip() # break title = " : ".join(title_parts) log.info("Extracted Title: %s" % title) # Title_Sort title_sort_parts = list(title_parts) title_sort_regex = re.match( '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0]) sortword = title_sort_regex.group(2) if sortword: title_sort_parts[0] = ''.join( filter(None, [ title_sort_regex.group(1).strip(), title_sort_regex.group(3).strip(), ", " + sortword ])) title_sort = " : ".join(title_sort_parts) log.info("Extracted Title_Sort: %s" % title_sort) # Authors authors = [] author_sort = None for i in record.xpath( ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # primary authors name = re.sub(" \[.*\]$", "", i.text.strip()) authors.append(name) for i in record.xpath( ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # secondary authors name = re.sub(" \[.*\]$", "", i.text.strip()) authors.append(name) if len( authors ) == 0: # if no "real" autor was found take all persons involved for i in record.xpath( ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # secondary authors name = re.sub(" \[.*\]$", "", i.text.strip()) authors.append(name) if len(authors) > 0: author_sort = authors[0] log.info("Extracted Authors: %s" % " & ".join(authors)) # Comments for i in record.xpath( ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns): if i.text.startswith("http://deposit.dnb.de/"): br = self.browser log.info('Downloading Comments from: %s' % i.text) try: comments = br.open_novisit(i.text, timeout=30).read() comments = sanitize_comments_html(comments) log.info('Comments: %s' % comments) break except: log.info("Could not download Comments from %s" % i) # Publisher Name and Location publisher_name = None publisher_location = None fields = record.xpath( 
".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns) if len(fields) > 0: publisher_name = fields[0].xpath( ".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip() publisher_location = fields[0].xpath( ".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip() else: fields = record.xpath( ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns) if len(fields) > 0: publisher_name = fields[0].xpath( ".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip() else: fields = record.xpath( ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns) if len(fields) > 0: publisher_location = fields[0].xpath( ".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip() log.info("Extracted Publisher: %s" % publisher_name) log.info("Extracted Publisher Location: %s" % publisher_location) # Publishing Date for i in record.xpath( ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns): match = re.search("(\d{4})", i.text.strip()) if match is not None: year = match.group(1) pubdate = datetime.datetime(int(year), 1, 2) break log.info("Extracted Publication Year: %s" % pubdate) # ID: IDN for i in record.xpath( ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): idn = i.text.strip() break log.info("Extracted ID IDN: %s" % idn) if "idn" in exact_search: if idn != exact_search["idn"]: log.info( "Extracted IDN does not match book's IDN, skipping record" ) continue # ID: URN for i in record.xpath( ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): urn = i.text.strip() break log.info("Extracted ID URN: %s" % urn) # ID: ISBN for i in record.xpath( ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): isbn_regex = "(?:ISBN(?:-1[03])?:? 
)?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]" match = re.search(isbn_regex, i.text.strip()) isbn = match.group() isbn = isbn.replace('-', '') break log.info("Extracted ID ISBN: %s" % isbn) if "isbn" in exact_search: if isbn != exact_search["isbn"]: log.info( "Extracted ISBN does not match book's ISBN, skipping record" ) continue # ID: Sachgruppe (DDC) for i in record.xpath( ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): ddc.append(i.text.strip()) log.info("Extracted ID DDC: %s" % ",".join(ddc)) # Series and Series_Index if series is None and series_index is None: for i in record.xpath( ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns): # Series Index series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',', '.') log.info("Extracted Series Index: %s" % series_index) # Series series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip() log.info("Extracted Series: %s" % series) break # Try to extract Series, Series Index and Title from the fetched title. # Caution: This overwrites DNB's series/series_index and modifies the title! if self.cfg_guess_series is True: guessed_series = None guessed_series_index = None parts = re.split("[:]", self.removeSortingCharacters(title)) if len(parts) == 2: if bool(re.search("\d", parts[0])) != bool( re.search("\d", parts[1])): # figure out which part contains the index if bool(re.search("\d", parts[0])): indexpart = parts[0] textpart = parts[1] else: indexpart = parts[1] textpart = parts[0] match = re.match( "^[\s\-–:]*(.+?)[\s\-–:]*$", textpart ) # remove odd characters from start and end of the text part if match: textpart = match.group(1) # from Titleparts like: "Name of the series - Episode 2" OK match = re.match( "^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) if guessed_series is None: guessed_series = textpart title = textpart + " : Band " + guessed_series_index else: title = textpart else: # from Titleparts like: "Episode 2 Name of the series" match = re.match( "^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$", indexpart) if match: guessed_series_index = match.group(1) guessed_series = match.group(2) if guessed_series is None: guessed_series = textpart title = textpart + " : Band " + guessed_series_index else: title = textpart elif len(parts) == 1: # from Titles like: "Name of the series - Title (Episode 2)" match = re.match( "^\s*(\S.+?) \- (\S.+?) 
[\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0]) if match: guessed_series_index = match.group(3) guessed_series = match.group(1) title = match.group(2) else: # from Titles like: "Name of the series - Episode 2" match = re.match( "^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0]) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) title = guessed_series + " : Band " + guessed_series_index if guessed_series is not None and guessed_series_index is not None: series = guessed_series series_index = guessed_series_index log.info("Guessed Series: %s" % series) log.info("Guessed Series Index: %s" % series_index) # GND Subjects from 689 for i in record.xpath( ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): subjects_gnd.append(i.text.strip()) # GND Subjects from 600-655 for f in range(600, 656): for i in record.xpath(".//marc21:datafield[@tag='" + str( f ) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): if i.text.startswith("("): continue subjects_gnd.append(i.text) log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd)) # Non-GND subjects from 600-655 for f in range(600, 656): for i in record.xpath(".//marc21:datafield[@tag='" + str( f ) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): # ignore entries starting with "(": if i.text.startswith("("): continue subjects_non_gnd.extend(re.split(',|;', i.text)) # remove one-character subjects: for i in subjects_non_gnd: if len(i) < 2: subjects_non_gnd.remove(i) log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd)) # Edition for i in record.xpath( ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): edition = i.text.strip() break log.info("Extracted Edition: %s" % edition) # Languages for i in record.xpath( ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns): languages.append(i.text.strip()) if languages is not None: log.info("Extracted Languages: %s" % ",".join(languages)) # Put it all together if self.cfg_append_edition_to_title == True and edition is not None: title = title + " : " + edition mi = Metadata( self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors)) mi.title_sort = self.removeSortingCharacters(title_sort) mi.author_sort = self.removeSortingCharacters(author_sort) mi.languages = languages mi.pubdate = pubdate mi.publisher = " : ".join( filter(None, [ publisher_location, self.removeSortingCharacters(publisher_name) ])) mi.series = self.removeSortingCharacters(series) mi.series_index = series_index mi.comments = comments mi.isbn = isbn # also required for cover download mi.set_identifier('urn', urn) mi.set_identifier('dnb-idn', idn) mi.set_identifier('ddc', ",".join(ddc)) if self.cfg_fetch_subjects == 0: mi.tags = self.uniq(subjects_gnd) elif self.cfg_fetch_subjects == 1: if len(subjects_gnd) > 0: mi.tags = self.uniq(subjects_gnd) else: mi.tags = self.uniq(subjects_non_gnd) elif self.cfg_fetch_subjects == 2: mi.tags = self.uniq(subjects_gnd + subjects_non_gnd) elif self.cfg_fetch_subjects == 3: if len(subjects_non_gnd) > 0: mi.tags = self.uniq(subjects_non_gnd) else: mi.tags = self.uniq(subjects_gnd) elif self.cfg_fetch_subjects == 4: 
mi.tags = self.uniq(subjects_non_gnd) elif self.cfg_fetch_subjects == 5: mi.tags = [] # put current result's metadata into result queue log.info("Final formatted result: %s" % mi) result_queue.put(mi)
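# self.uniq() above deduplicates the subject lists; for the tag handling to be
# stable it needs to preserve first-seen order, unlike a plain set(). A
# minimal order-preserving sketch of such a helper (assumed behaviour; the
# plugin's own implementation is outside this excerpt):
def uniq(values):
    seen = set()
    out = []
    for v in values:
        if v not in seen:
            seen.add(v)
            out.append(v)
    return out
# uniq(['Fantasy', 'Roman', 'Fantasy']) -> ['Fantasy', 'Roman']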
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    from calibre.utils.date import parse_date, utcnow

    douban_id = entry_.get('id')
    title = entry_.get('title')
    description = entry_.get('summary')
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get('publisher')
    isbn = entry_.get('isbn13')  # ISBN10 is obsolete, use ISBN13
    pubdate = entry_.get('pubdate')
    authors = entry_.get('author')
    book_tags = entry_.get('tags')
    rating = entry_.get('rating')
    cover_url = entry_.get('images', {}).get('large')
    series = entry_.get('series')

    if not authors:
        authors = [_('Unknown')]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {'douban': douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN
    isbns = []
    if isinstance(isbn, (type(''), bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for x in isbn:
            if check_isbn(x):
                isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    mi.tags = [tag['name'] for tag in book_tags]

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating:
        try:
            mi.rating = float(rating['average']) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series['title']

    return mi
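# Douban reports the average rating on a 0-10 scale while calibre's
# Metadata.rating is 0-5, hence the division by 2.0 above. A tiny worked
# example of that conversion:
def douban_rating_to_calibre(rating):
    # {'average': '8.6'} -> 4.3
    return float(rating['average']) / 2.0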
def extract_vol_details(self, vol_url): # Here we extract and format the information from the chosen volume. # - The first name and last name to populate author and author sort : vol_auteur_prenom and vol_auteur_nom # - The title of the volume : vol_title # - The series name the volume is part of : vol_serie # - The sequence number in the series : vol_serie_seq # missing # - The editor of this volume : vol_editor # - The editor's collection of this volume : vol_coll # - The collection serial code of this volume : vol_coll_srl # - The "dépot légal" date (the publication date is mostly unknown) : vol_dp_lgl # date format to be computed # - The ISBN number associated with the volume : vol_isbn # - The volume tags : vol_genre # - The url pointer to the volume cover image : vol_cover_index # - The comments include various info about the book : vol_comment_soup # . reference, a url pointer to noosfere # . couverture, a url pointer to noosfere; the cover may be really small, but it is accurate to the volume # . first edition information # . series (cycle) name and number # . this volume editor info # . Resume (quatrième de couverture) # . Critiques # . Sommaire detailing what novels are in the volume when it is an anthology # . Critiques about the series and/or about another volume of the book # debug = self.dbg_lvl & 2 self.log.info(self.who, "\nIn extract_vol_details(soup)") if debug: self.log.info(self.who, "vol_url : ", vol_url) if debug: self.log.info( self.who, "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')" ) self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who) rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who) soup = rsp[0] url_vrai = rsp[1].replace("&Tri=3", "") # if debug: self.log.info(self.who,soup.prettify()) # useful but too big... self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace( '?', '&').replace('=', '&').split('&')[2] # self.nsfr_id = (self.nfsr_id).strip("$") # If I use this form, it gives this error: 'Worker' object has no attribute 'nfsr_id' ???
tmp = self.nsfr_id self.nsfr_id = tmp.strip('$') if debug: self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id, type(self.nsfr_id)) tmp_lst = [] vol_info = {} vol_title = "" vol_auteur = "" vol_auteur_prenom = "" vol_auteur_nom = "" vol_serie = "" vol_serie_seq = "" vol_editor = "" vol_coll = "" vol_coll_srl = "" vol_dp_lgl = "" vol_isbn = "" vol_genre = "" vol_cover_index = "" comment_generic = None comment_resume = None comment_Critiques = None comment_Sommaire = None comment_AutresCritique = None comment_cover = None comment_decoupage_annexe = None # add volume address as a reference in the comment vol_comment_soup = BS( '<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai + '</a></p></div>', "lxml") if debug: self.log.info(self.who, "vol reference processed") if soup.select("span[class='TitreNiourf']"): vol_title = soup.select( "span[class='TitreNiourf']")[0].text.strip() if debug: self.log.info(self.who, "vol_title processed : ", vol_title) if soup.select("span[class='AuteurNiourf']"): vol_auteur = soup.select( "span[class='AuteurNiourf']")[0].text.replace("\n", "").strip() if debug: self.log.info(self.who, "vol_auteur processed : ", vol_auteur) for i in range(len(vol_auteur.split())): if not vol_auteur.split()[i].isupper(): vol_auteur_prenom += " " + vol_auteur.split()[i] else: vol_auteur_nom += " " + vol_auteur.split()[i].title() vol_auteur = vol_auteur.title() vol_auteur_prenom = vol_auteur_prenom.strip() if debug: self.log.info(self.who, "vol_auteur_prenom processed : ", vol_auteur_prenom) vol_auteur_nom = vol_auteur_nom.strip() if debug: self.log.info(self.who, "vol_auteur_nom processed : ", vol_auteur_nom) if soup.select("a[href*='serie.asp']"): if soup.select("a[href*='serie.asp']")[0].find_parent( "span", {"class": "ficheNiourf"}): vol_serie = soup.select("a[href*='serie.asp']")[0].text tmp_vss = [ x for x in soup.select("a[href*='serie.asp']") [0].parent.stripped_strings ] for i in range(len(tmp_vss)): if "vol." 
in tmp_vss[i]: if not vol_serie_seq: vol_serie_seq = tmp_vss[i].replace("vol.", "").strip() if "découpage" in tmp_vss[i]: dec_anx_url = "https://www.noosfere.org/livres/" + soup.select( "a[href*='serie.asp']")[0]['href'] comment_pre_decoupage_annexe = BS( '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>', "lxml") comment_decoupage_annexe = self.get_decoupage_annexe( dec_anx_url) if debug: self.log.info(self.who, "vol_serie, vol_serie_seq processed : ", vol_serie, ",", vol_serie_seq) comment_generic = soup.select("span[class='ficheNiourf']")[0] new_div = soup.new_tag('div') comment_generic = comment_generic.wrap(new_div) if debug: self.log.info(self.who, "comment_generic processed") if soup.select("a[href*='editeur.asp']"): vol_editor = soup.select("a[href*='editeur.asp']")[0].text if debug: self.log.info(self.who, "vol_editor processed : ", vol_editor) if soup.select("a[href*='collection.asp']"): vol_coll = soup.select("a[href*='collection.asp']")[0].text if debug: self.log.info(self.who, "vol_coll : ", vol_coll) for i in comment_generic.stripped_strings: tmp_lst.append(str(i)) vol_coll_srl = tmp_lst[len(tmp_lst) - 1] if "n°" in vol_coll_srl: for k in ["n°", "(", ")"]: if k in vol_coll_srl: vol_coll_srl = vol_coll_srl.replace(k, "") vol_coll_srl = vol_coll_srl.strip() vol_coll_srl = vol_coll_srl.split("/")[0] if vol_coll_srl[0].isnumeric(): vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:] else: vol_coll_srl = "" if debug: self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl) # publication date is largely ignored in noosfere, but we have the "dépot legal" date and I use it instead # note that I 'calculate' the missing day of the month and even sometimes the missing month ms = ("janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre") for elemnt in soup.select_one( "span[class='sousFicheNiourf']").stripped_strings: if debug: self.log.info(self.who, "elemnt : ", elemnt) if not vol_dp_lgl: elemn = (elemnt.replace("Dépôt légal :", "").split(','))[0].strip() if elemn: if elemn.isnumeric() and len(elemn) == 4: vol_dp_lgl = datetime.datetime.strptime( "175 " + elemn, "%j %Y") elif "semestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:] + " " + ele[2], "%j %Y") elif "trimestre" in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:] + " " + ele[2], "%j %Y") else: for i in range(len(ms)): if ms[i] in elemn: ele = elemn.split() vol_dp_lgl = datetime.datetime.strptime( ("000" + str(10 + 31 * i))[-3:] + " " + ele[1], "%j %Y") break if debug: self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl) if "ISBN" in elemnt: vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '') if "néant" in vol_isbn: vol_isbn = "" if debug: self.log.info(self.who, "vol_isbn processed : ", vol_isbn) if "Genre" in elemnt: vol_genre = elemnt.lstrip("Genre : ") if debug: self.log.info(self.who, "vol_genre processed : ", vol_genre) if soup.select("img[name='couverture']"): for elemnt in repr( soup.select("img[name='couverture']")[0]).split('"'): if "http" in elemnt: if not vol_cover_index: vol_cover_index = elemnt if debug: self.log.info(self.who, "vol_cover_index processed : ", vol_cover_index) # add cover image address as a reference in the comment if vol_cover_index: comment_cover = BS( '<div><p>Couverture: <a href="' + vol_cover_index + '">' + 
vol_cover_index + '</a></p></div>', "lxml") # select the fields I want... More exist, such as film adaptations or reading advice # but that is not quite consistent across all the books (noosfere is a common database fed by many people) # and besides I have enough info as it is AND I do NOT want to take away noosfere's business tmp_comm_lst = soup.select("span[class='AuteurNiourf']") if debug: self.log.info(self.who, tmp_comm_lst) # useful but too long for i in range(len(tmp_comm_lst)): if "Quatrième de couverture" in str(tmp_comm_lst[i]): comment_resume = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_resume processed") if "Critiques" in str(tmp_comm_lst[i]): if not "autres" in str(tmp_comm_lst[i]): comment_Critiques = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Critiques processed") if "Sommaire" in str(tmp_comm_lst[i]): comment_Sommaire = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if debug: self.log.info(self.who, "comment_Sommaire processed") if "Critiques des autres" in str(tmp_comm_lst[i]): comment_AutresCritique = tmp_comm_lst[i].find_parents( "div", {'class': 'sousbloc'})[0] if comment_AutresCritique.select('a[href*="serie.asp"]') and ( "Critique de la série" in comment_AutresCritique. select('a[href*="serie.asp"]')[0].text): critic_url = "https://www.noosfere.org/livres/" + comment_AutresCritique.select( 'a[href*="serie.asp"]')[0]['href'] try: more_comment_AutresCritique = self.get_Critique_de_la_serie( critic_url) comment_AutresCritique.append( more_comment_AutresCritique) except: self.log.exception( "get_Critique_de_la_serie failed for url: ", critic_url) if debug: self.log.info(self.who, "comment_AutresCritique processed") # group in a big bundle all the fields I think I want... (It is difficult not to include more... :-)) if comment_cover: vol_comment_soup.append(comment_cover) if comment_generic: vol_comment_soup.append(comment_generic) if comment_resume: vol_comment_soup.append(comment_resume) if comment_Critiques: vol_comment_soup.append(comment_Critiques) if comment_Sommaire: vol_comment_soup.append(comment_Sommaire) if comment_AutresCritique: vol_comment_soup.append(comment_AutresCritique) if comment_decoupage_annexe: vol_comment_soup.append( comment_pre_decoupage_annexe) # this is the title vol_comment_soup.append(comment_decoupage_annexe) # # Make a minimum of "repair" over vol_comment_soup so that it displays correctly (how I like it) in the comments and in my catalogs # - I hate justify when it makes margins "float" around the correct position (in fact when spaces are used instead of absolute positioning) # - I like to have functional urls when they exist # - I like to find out the next and/or previous books in a series (simulated arrows are links :-) ) for elemnt in vol_comment_soup.select('[align="justify"]'): del elemnt['align'] # remove all double or triple 'br' to improve presentation. # Note: tmp1 and tmp2 must contain a different value from any possible first elemnt. (yes, I am lrp and I am unique :-) ) # # yeah, well: if I modify comment_generic AFTER it has been integrated into vol_comment_soup, there is only one version in memory... # so vol_comment_soup gets modified...
        tmp1 = tmp2 = "lrp_the_unique"
        for elemnt in vol_comment_soup.findAll():
            tmp1, tmp2 = tmp2, elemnt
            if tmp1 == tmp2:
                elemnt.extract()

        # set the author headings apart: preceded by a line break, bold, bigger
        for elemnt in vol_comment_soup.select('.AuteurNiourf'):
            # a fresh tag is needed each time: inserting the same 'br' tag
            # object repeatedly would only move it from heading to heading
            elemnt.insert(0, soup.new_tag('br'))
            elemnt["style"] = "font-weight: 600; font-size: 18px"

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who,
                                  "incomplete url before correction: ", elemnt)

        # Make the relative links absolute. The order matters: the more
        # specific forms must be patched before the bare file names, and the
        # 'http' guard keeps already-corrected links from being patched twice.
        url_fixes = (
            ("a[href*='/livres/auteur.asp']", "/livres/auteur.asp",
             "https://www.noosfere.org/livres/auteur.asp"),
            ("a[href*='/livres/niourf.asp']", "/livres/niourf.asp",
             "https://www.noosfere.org/livres/niourf.asp"),
            ("a[href*='/heberg/']", "/heberg/",
             "https://www.noosfere.org/heberg/"),
            ("a[href*='./EditionsLivre.asp']", "./EditionsLivre.asp",
             "https://www.noosfere.org/livres/EditionsLivre.asp"),
            ("a[href*='./niourf.asp']", "./niourf.asp",
             "https://www.noosfere.org/livres/niourf.asp"),
            ("a[href*='heberg']", "../../heberg",
             "https://www.noosfere.org/heberg"),
            ("a[href*='../bd']", "../bd",
             "https://www.noosfere.org/bd"),
            ("a[href*='auteur.asp']", "auteur.asp",
             "https://www.noosfere.org/livres/auteur.asp"),
            ("a[href*='collection.asp']", "collection.asp",
             "https://www.noosfere.org/livres/collection.asp"),
            ("a[href*='critsign.asp']", "critsign.asp",
             "https://www.noosfere.org/livres/critsign.asp"),
            ("a[href*='EditionsLivre.asp']", "EditionsLivre.asp",
             "https://www.noosfere.org/livres/EditionsLivre.asp"),
            ("a[href*='editeur.asp']", "editeur.asp",
             "https://www.noosfere.org/livres/editeur.asp"),
            ("a[href*='editionslivre.asp']", "editionslivre.asp",
             "https://www.noosfere.org/livres/editionslivre.asp"),
            ("a[href*='niourf.asp']", "niourf.asp",
             "https://www.noosfere.org/livres/niourf.asp"),
            ("a[href*='serie.asp']", "serie.asp",
             "https://www.noosfere.org/livres/serie.asp"),
        )
        for selector, relative, absolute in url_fixes:
            for elemnt in vol_comment_soup.select(selector):
                if 'http' not in elemnt.get('href'):
                    elemnt["href"] = elemnt["href"].replace(relative, absolute)
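        # The table above patches each relative form explicitly. A more
        # generic alternative (a sketch only, not wired in, and it would need
        # checking against noosfere's "../.." forms) would be
        # urllib.parse.urljoin with the page URL as base:
        #
        #   from urllib.parse import urljoin
        #   base = "https://www.noosfere.org/livres/niourf.asp"
        #   urljoin(base, "auteur.asp")      # -> https://www.noosfere.org/livres/auteur.asp
        #   urljoin(base, "../bd/toto.asp")  # -> https://www.noosfere.org/bd/toto.asp
        #                                    #    (toto.asp is a made-up name)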
"https://www.noosfere.org/livres/serie.asp") if debug: for elemnt in vol_comment_soup.select("a[href*='.asp']"): if 'http' not in elemnt.get('href'): self.log.info(self.who, "url incomplet apres correction: ", elemnt) fg, fd = "<<==", "==>>" #chr(0x21D0),chr(0x21D2) #chr(0x27f8),chr(0x27f9) for elemnt in vol_comment_soup.select("img[src*='arrow_left']"): elemnt.replace_with(fg) for elemnt in vol_comment_soup.select("img[src*='arrow_right']"): elemnt.replace_with(fd) # depending on the tick box, make a fat publisher using seperators that have a very low probability to pop up (§ and €) # only set vol_coll_srl if vol_coll exists # the idea is to use search and replace in the edit Metadata in bulk window. if self.extended_publisher: if debug: self.log.info( self.who, """flag : "Ajoute collection et son numéro d'ordre au champ èditeur" set""" ) if vol_coll: if debug: self.log.info(self.who, 'add collection') vol_editor = vol_editor + ('§') + vol_coll if vol_coll_srl: if debug: self.log.info(self.who, 'add collection number') vol_editor = vol_editor + ('€') + vol_coll_srl if vol_serie: if vol_serie_seq.isnumeric(): vol_serie_seq = float(vol_serie_seq) else: vol_serie_seq = 1.0 # UTF-8 characters may be serialized different ways, only xmlcharrefreplace produces xml compatible strings # any other non ascii character with another utf-8 byte representation will make calibre behave with the messsage: # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters # Side note: # I have no real good url structure(i once got html 3 times, div a sibling of html...), but calibre does not seems to care (nice :-) ) # # Ca m'a pris un temps fou pour trouver, par hazard, que encode('ascii','xmlcharrefreplace') aidait bien... # (enfin, quasi par hazard, j' ai essayé tout ce qui pouvait ameliorer la compatibilité avec xml... 
        # UTF-8 characters may be serialized in different ways; only
        # xmlcharrefreplace produces XML-compatible strings. Any other
        # non-ASCII character, with another UTF-8 byte representation, makes
        # calibre fail with the message:
        #   ValueError: All strings must be XML compatible: Unicode or ASCII,
        #   no NULL bytes or control characters
        # Side note: the html structure produced is not really sound (I once
        # got html three times, with a div as a sibling of html...), but
        # calibre does not seem to care (nice :-)).
        # It took me a crazy amount of time to discover, almost by chance,
        # that encode('ascii', 'xmlcharrefreplace') was the fix (almost by
        # chance: I had tried everything that could improve compatibility
        # with xml... but I misread the error and thought the problem was an
        # incompatibility with the xml structure).
        vol_comment_soup = vol_comment_soup.encode('ascii', 'xmlcharrefreplace')

        self.log.info(self.who, "+++" * 25)
        self.log.info(self.who, "nsfr_id, type() : ", self.nsfr_id,
                      type(self.nsfr_id))        # must be <class 'str'>
        self.log.info(self.who, "relevance, type() : ", self.relevance,
                      type(self.relevance))      # must be <class 'float'>
        self.log.info(self.who, "vol_title, type() : ", vol_title,
                      type(vol_title))           # must be <class 'str'>
        self.log.info(self.who, "vol_auteur, type() : ", vol_auteur,
                      type(vol_auteur))          # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_auteur_prenom, type() : ",
                      vol_auteur_prenom,
                      type(vol_auteur_prenom))   # must be <class 'str'>
        self.log.info(self.who, "vol_auteur_nom, type() : ", vol_auteur_nom,
                      type(vol_auteur_nom))      # must be <class 'str'>
        if vol_serie:
            self.log.info(self.who, "vol_serie, type() : ", vol_serie,
                          type(vol_serie))       # must be <class 'str'>
            self.log.info(self.who, "vol_serie_seq, type() : ", vol_serie_seq,
                          type(vol_serie_seq))   # must be <class 'float'>
        self.log.info(self.who, "vol_editor, type() : ", vol_editor,
                      type(vol_editor))          # must be <class 'str'>
        self.log.info(self.who, "vol_coll, type() : ", vol_coll,
                      type(vol_coll))            # must be <class 'str'>
        self.log.info(self.who, "vol_coll_srl, type() : ", vol_coll_srl,
                      type(vol_coll_srl))        # must be <class 'str'>
        self.log.info(self.who, "vol_dp_lgl, type() : ", vol_dp_lgl,
                      type(vol_dp_lgl))          # must be <class 'datetime.datetime'> ('renderer=isoformat')
        self.log.info(self.who, "vol_isbn, type() : ", vol_isbn,
                      type(vol_isbn))            # must be <class 'str'>
        self.log.info(self.who, "vol_genre, type() : ", vol_genre,
                      type(vol_genre))           # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_cover_index, type() : ", vol_cover_index,
                      type(vol_cover_index))     # must be <class 'str'>
        self.log.info(self.who, "type(vol_comment_soup) : ",
                      type(vol_comment_soup))    # must be bytes (starts with b'blablabla...)
        # self.log.info(self.who, "vol_comment_soup :\n", vol_comment_soup)  # maybe a bit long sometimes
        # language must be <class 'str'>

        if vol_cover_index:
            self.plugin.cache_identifier_to_cover_url(self.nsfr_id,
                                                      vol_cover_index)
        if vol_isbn:
            self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id)

        mi = Metadata(vol_title, [vol_auteur])
        mi.set_identifier('nsfr_id', self.nsfr_id)
        mi.publisher = vol_editor
        mi.isbn = vol_isbn
        mi.tags = [vol_genre]
        mi.source_relevance = self.relevance
        mi.has_cover = bool(vol_cover_index)
        if vol_dp_lgl:
            mi.pubdate = vol_dp_lgl
        if vol_serie:
            mi.series = vol_serie
            mi.series_index = vol_serie_seq
        mi.language = "fra"
        mi.comments = vol_comment_soup

        if debug:
            self.log.info(self.who, "mi\n", mi, "\n")
        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
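        # Hedged sketch of how the caches filled above are consumed later on.
        # As I understand calibre's Source API, the cover downloader can
        # recover the URL cached under the identifier with something like
        #
        #   url = self.plugin.cached_identifier_to_cover_url(self.nsfr_id)
        #
        # falling back to cached_isbn_to_identifier(vol_isbn) when only the
        # ISBN is known; both getters mirror the cache_* setters used above.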