def get_metadata(self, md, select):
    """Look up a Douban record for ``md`` and convert it to ``Metadata``.

    The ISBN lookup is tried first when ``md.isbn`` is set; the title lookup
    (title, author sort and ``select``) is the fallback.  Returns ``None``
    when neither lookup yields a record.  The large cover image is always
    downloaded and attached as ``cover_data``.
    """
    record = self.get_book_by_isbn(md.isbn) if md.isbn else None
    if not record:
        record = self.get_book_by_title(md.title, md.author_sort, select)
    if not record:
        return None

    mi = Metadata(record['title'])
    mi.authors = record['author']
    if mi.authors:
        mi.author_sort = mi.authors[0]
    else:
        mi.author_sort = None
    if mi.author_sort:
        # Run every REMOVES substitution over the first author and keep the
        # author list in step with the cleaned value.
        for pattern in REMOVES:
            mi.author_sort = pattern.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort

    mi.publisher = record['publisher']
    mi.comments = record['summary']
    mi.isbn = record.get('isbn13', None)
    # Only the first eight tag names are kept.
    tag_names = [tag['name'] for tag in record['tags']]
    mi.tags = tag_names[:8]
    average = float(record['rating']['average'])
    mi.rating = int(average)
    mi.pubdate = self.str2date(record['pubdate'])
    mi.timestamp = datetime.datetime.now()

    mi.douban_id = record['id']
    mi.douban_author_intro = record['author_intro']
    mi.douban_subtitle = record.get('subtitle', None)

    # The cover format is taken from the URL's file extension.
    cover_url = record['images']['large']
    extension = cover_url.split(".")[-1]
    mi.cover_data = (extension, StringIO(urlopen(cover_url).read()))
    return mi
def _metadata(self, book):
    """Translate a Douban book record (a dict) into a ``Metadata`` object.

    Every entry of ``book['author']`` has each ``REMOVES`` substitution
    applied; when no author is left a placeholder author is used.  When
    ``self.copy_image`` is truthy the large cover image is downloaded and
    stored as ``cover_data``.
    """
    cleaned = []
    for name in (book['author'] or []):
        for pattern in REMOVES:
            name = pattern.sub("", name)
        cleaned.append(name)
    if not cleaned:
        cleaned = [u'佚名']

    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    mi = Metadata(book['title'])
    mi.authors = cleaned
    mi.author_sort = mi.authors[0]
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    # Only the first eight tag names are kept.
    tag_names = [tag['name'] for tag in book['tags']]
    mi.tags = tag_names[:8]
    average = float(book['rating']['average'])
    mi.rating = int(average)
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()

    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)
    mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
    mi.source = u'豆瓣'
    mi.cover_url = book['images']['large']

    if self.copy_image:
        raw = StringIO(urlopen(mi.cover_url).read())
        # The cover format is taken from the URL's file extension.
        mi.cover_data = (mi.cover_url.split(".")[-1], raw)

    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
def to_metadata(self, log, entry):  # {{{
    """Build a ``Metadata`` object from one OZON result element.

    Each field is read from the child element of ``entry`` with the matching
    local name, using whitespace-normalised text.

    :param log: logger handed on to ``toPubdate``.
    :param entry: element exposing ``xpath``.
    :return: the populated ``Metadata``; ``ozon_cover_url`` is ``None`` when
        the entry has no picture.
    """
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

    def field(name):
        # Normalised text of the child element called ``name``.
        return entry.xpath(xp_template.format(name))

    title = field('Name')
    author = field('Author')
    # Authors arrive as one comma separated string.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    mi.identifiers = {'ozon': field('ID')}
    mi.comments = field('Annotation')

    mi.ozon_cover_url = None
    cover = field('Picture')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    pub_year = field('Year')
    if pub_year:
        mi.pubdate = toPubdate(log, pub_year)

    rating = field('ClientRatingValue')
    if rating:
        try:
            # 'rating' is a floating point number between 0 and 10.
            # OZON rates N of 5 while calibre uses 10; the value is stored
            # unscaled (the original note suspects a bug in identify).
            mi.rating = float(rating)
        except (TypeError, ValueError):
            # Non-numeric rating text: best effort, leave the rating unset.
            pass
    return mi
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id.

    The issue is fetched through ``pycomicvine.Issue`` with only the fields
    used below.

    :param log: logger; ``warn`` is called when the issue cannot be used.
    :param issue_id: comicvine issue identifier.
    :return: a ``Metadata`` object, or ``None`` when the issue or its volume
        is falsy.
    """
    issue = pycomicvine.Issue(
        issue_id,
        field_list=[
            "id",
            "name",
            "volume",
            "issue_number",
            "person_credits",
            "description",
            "store_date",
            "cover_date",
        ],
    )
    if not issue or not issue.volume:
        # %s instead of %d: %d raises TypeError when the id is a string,
        # which would turn this failure path into a crash.
        log.warn("Unable to load Issue(%s)" % issue_id)
        return None

    title = "%s #%s" % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = "%s: %s" % (title, issue.name)
    authors = [p.name for p in issue.person_credits]

    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier("comicvine", str(issue.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    # Prefer the store date, fall back to the cover date.
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
def parse(self, xml_detail, xml_more_info):
    """Assemble a ``Metadata`` object from the detail and more-info documents.

    All ``parse_*`` helpers run first; the record is then dropped (logged,
    ``None`` returned) when either the title or the authors are ``None``.
    A found cover URL is also cached on the plugin under ``self.ident``.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    serie, serie_index = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    if title is None or authors is None:
        self.log('Result skipped for because title or authors not found')
        return None

    mi = Metadata(as_unicode(title), authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: self.ident}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(self.ident, cover)
    return mi
def parse(self, xml_detail):
    """Build a ``Metadata`` object from a pipe-delimited detail response.

    The second line of ``xml_detail`` is split on ``|``; the fields used are
    0 (author), 1 (title), 3 (ISBN), 6 (publisher), 13 (comments) and
    34 (publication date as ``Y-M-D``).

    :return: the populated ``Metadata``.  A cover URL is derived from the
        ISBN and cached on the plugin only when the ISBN field is non-empty.
    """
    data = xml_detail.split('\n')[1].split("|")
    self.log(data)

    title = data[1]
    authors = [data[0]]
    comments = data[13]
    isbn = data[3]
    publisher = data[6]

    pub_date_tmp = data[34].split('-')
    pub_date = datetime.datetime(int(pub_date_tmp[0]),
                                 int(pub_date_tmp[1]),
                                 int(pub_date_tmp[2]),
                                 tzinfo=utc_tz)

    # Values produced by str.split are never None, so the ISBN is tested for
    # emptiness; otherwise a blank field would give ".../covers/.jpg".
    if isbn:
        isbn_tmp = isbn.replace("-", "")
        cover = "%s/images/covers/%s.jpg" % (self.plugin.BASE_URL, isbn_tmp)
    else:
        cover = None

    # NOTE(review): title and authors both come from str.split / a list
    # literal, so this guard can never fire as written; it is kept unchanged.
    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: self.ident}
    mi.publisher = publisher
    mi.pubdate = pub_date
    mi.isbn = isbn
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(self.ident, cover)
    return mi
def parse(self, xml_detail):
    """Assemble a ``Metadata`` object from a single detail document.

    All ``parse_*`` helpers run first; ``None`` is returned when either the
    title or the authors are ``None``.  The record is identified by
    ``str(self.number)``, which is also the key used to cache a found cover.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: str(self.number)}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(str(self.number), cover)
    return mi
def get_metadata(self, md):
    """Look up a Douban record for ``md`` and convert it to ``Metadata``.

    The ISBN lookup is tried first when ``md.isbn`` is set; the title lookup
    is the fallback.

    :return: the populated ``Metadata`` with the large cover attached as
        ``cover_data``, or ``None`` when neither lookup yields a record.
    """
    book = None
    if md.isbn:
        book = self.get_book_by_isbn(md.isbn)
    if not book:
        book = self.get_book_by_title(md.title)
    if not book:
        # Both lookups failed; indexing the missing record would raise.
        return None

    mi = Metadata(book['title'])
    mi.authors = book['author']
    mi.author_sort = mi.authors[0] if mi.authors else None
    if mi.author_sort:
        # Run every REMOVES substitution over the first author and keep the
        # author list in step with the cleaned value.
        for r in REMOVES:
            mi.author_sort = r.sub("", mi.author_sort)
        mi.authors[0] = mi.author_sort
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    # Only the first eight tag names are kept.
    mi.tags = [t['name'] for t in book['tags']][:8]
    mi.rating = int(float(book['rating']['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)

    # The cover format is taken from the URL's file extension.
    img_url = book['images']['large']
    img_fmt = img_url.split(".")[-1]
    img = StringIO(urlopen(img_url).read())
    mi.cover_data = (img_fmt, img)

    # A successful lookup is diagnostic output, not an error.
    logging.debug("=================\ndouban metadata:\n%s", mi)
    return mi
def to_metadata(self, log, entry):  # {{{
    """Convert one OZON result element into a ``Metadata`` object.

    Every value is the whitespace-normalised text of the child element of
    ``entry`` with the given local name.

    :param log: logger handed on to ``toPubdate``.
    :param entry: element exposing ``xpath``.
    :return: the populated ``Metadata``; ``ozon_cover_url`` stays ``None``
        when the entry has no picture.
    """
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

    title = entry.xpath(xp_template.format('Name'))
    author = entry.xpath(xp_template.format('Author'))
    # Authors arrive as one comma separated string.
    raw_names = map(unicode.strip, unicode(author).split(u','))
    norm_authors = map(_normalizeAuthorNameWithInitials, raw_names)
    mi = Metadata(title, norm_authors)

    ozon_id = entry.xpath(xp_template.format('ID'))
    mi.identifiers = {'ozon': ozon_id}
    mi.comments = entry.xpath(xp_template.format('Annotation'))

    mi.ozon_cover_url = None
    cover = entry.xpath(xp_template.format('Picture'))
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    pub_year = entry.xpath(xp_template.format('Year'))
    if pub_year:
        mi.pubdate = toPubdate(log, pub_year)

    rating = entry.xpath(xp_template.format('ClientRatingValue'))
    if rating:
        try:
            # 'rating' is a floating point number between 0 and 10.
            # OZON rates N of 5 while calibre uses 10; the value is stored
            # unscaled (the original note suspects a bug in identify).
            mi.rating = float(rating)
        except (TypeError, ValueError):
            # Non-numeric rating text: best effort, leave the rating unset.
            pass
    return mi
def get_baike_metadata(self, title):
    """Fetch the Baidu Baike page for ``title`` and convert it to ``Metadata``.

    :return: the populated ``Metadata``, or ``None`` when the page cannot be
        constructed.  ``pubdate`` defaults to the current time and is
        replaced by the date found in the serial-status field when that
        field marks the work as finished and contains a ``Y-M-D`` date.
    """
    from baidubaike import Page
    try:
        baike = Page(title)
    except Exception:
        return None

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = info.get(u'首发网站', None)
    if not plat:
        # Unicode default: on Python 2 a byte-string default would make the
        # unicode .replace() below raise UnicodeDecodeError.
        plat = info.get(u'首发状态', u"网络小说平台")
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    # Fall back to a placeholder author instead of raising KeyError.
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.isbn = '0000000000001'
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.comments = baike.get_summary()

    status = info.get(u'连载状态', "")
    if u'完结' in status:
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No usable date in the status text; keep the default pubdate.
            pass
    return mi
def _metadata(self, baike):
    """Convert a Baidu Baike page object into a calibre ``Metadata`` object.

    :param baike: page object providing ``get_info``, ``get_tags``,
        ``get_image``, ``get_summary`` and ``http.url``.
    :return: the populated ``Metadata``.  The cover image is downloaded into
        ``cover_data`` only when ``self.copy_image`` is set and a cover URL
        exists.  ``pubdate`` defaults to the current time and is replaced by
        the date found in the serial-status field when that field marks the
        work as finished and contains a ``Y-M-D`` date.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    # Unicode default: on Python 2 a byte-string default would make the
    # unicode .replace() below raise UnicodeDecodeError.
    plat = u"网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[n]" marker from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    # Without a cover URL there is nothing to download.
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    status = info.get(u'连载状态', "")
    if u'完结' in status:
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No usable date in the status text; keep the default pubdate.
            pass
    return mi
def result2meta(self, result, prev_identifiers=None):
    '''
    Converts the result dict into Calibre metadata.

    Note: Source download plugins do not have access to custom columns.

    :param result: dict describing one search result.
    :param prev_identifiers: identifiers handed to ``update_identifiers``
        together with ``result``; defaults to a fresh empty dict per call.
    :return: the populated ``Metadata``.  ``source_relevance`` is
        ``100 - result['score']`` when a score is present, else 100.
    '''
    if prev_identifiers is None:
        # A shared ``{}`` default would leak state between calls if the
        # helper mutates it.
        prev_identifiers = {}

    title = get_title(result)
    authors = get_author_list(result)
    mi = Metadata(title=title, authors=authors)
    mi.identifiers = update_identifiers(prev_identifiers, result)

    put_publisher(mi, result)
    put_language(mi, result)
    self.put_pubdate(mi, result)
    put_tags(mi, result)
    put_journal(mi, result)
    self.put_series_index(mi, result)

    # Collect the sections first so the joined text never starts with an
    # empty section (i.e. a leading blank-line separator).
    sections = []
    if prefs['abstract_to_comment'] and 'abstract' in result:
        sections.append(result['abstract'])
    if prefs['query_to_comment']:
        extra_meta = self.mkComments(result)
        sections.append("\n".join("crossref:%s" % x for x in extra_meta))
    mi.comments = "\n\n".join(sections)

    if 'score' in result:
        mi.source_relevance = 100 - result['score']
    else:
        mi.source_relevance = 100
    return mi
def test_rtf_metadata(self):
    """Write metadata into a minimal RTF stream and read it back unchanged."""
    expected = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
    expected.tags = 'tag1 見tag2'.split()
    expected.comments = '<p>some ⊹comments</p>'
    expected.publisher = 'publiSher'

    buf = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
    set_metadata(buf, expected)
    buf.seek(0)
    actual = get_metadata(buf)

    # Each written field must survive the round trip.
    for field in 'title authors publisher comments tags'.split():
        self.assertEqual(getattr(expected, field), getattr(actual, field))
def test_rtf_metadata(self):
    """Round-trip non-ASCII metadata through ``set_metadata``/``get_metadata``."""
    rtf = BytesIO(br'{\rtf1\ansi\ansicpg1252}')

    written = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
    written.publisher = 'publiSher'
    written.comments = '<p>some ⊹comments</p>'
    written.tags = 'tag1 見tag2'.split()

    set_metadata(rtf, written)
    rtf.seek(0)
    read_back = get_metadata(rtf)

    checked = 'title authors publisher comments tags'.split()
    for name in checked:
        self.assertEqual(getattr(written, name), getattr(read_back, name))
def read_metadata(root, ver=None, return_extra_data=False):
    """Read package metadata from ``root`` into a ``Metadata`` object.

    Every field starts from the defaults of a fresh ``Metadata`` and is
    overwritten only when the corresponding reader returns a truthy value
    (for dates: a value that is not undefined).

    :return: the ``Metadata`` object, or, when ``return_extra_data`` is
        true, the tuple ``(metadata, ver, raster cover, first spine item)``.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)

    # 'calibre' and 'uuid' identifiers map to dedicated attributes; every
    # other scheme goes into the identifiers dict (first value only).
    other_ids = {}
    for scheme, values in iteritems(read_identifiers(root, prefixes, refines)):
        first = values[0]
        if scheme == 'calibre':
            ans.application_id = first
        elif scheme == 'uuid':
            ans.uuid = first
        else:
            other_ids[scheme] = first
    ans.set_identifiers(other_ids)

    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages

    names, sorts = [], []
    for author in read_authors(root, prefixes, refines):
        names.append(author.name)
        sorts.append(author.sort)
    ans.authors = names or ans.authors
    ans.author_sort = authors_to_string(sorts) or ans.author_sort

    producers = read_book_producers(root, prefixes, refines)
    if producers and producers[0]:
        ans.book_producer = producers[0]

    # Dates are copied only when the reader returned a defined date.
    date_readers = (
        ('pubdate', read_pubdate),
        ('timestamp', read_timestamp),
        ('last_modified', read_last_modified),
    )
    for field, reader in date_readers:
        value = reader(root, prefixes, refines)
        if not is_date_undefined(value):
            setattr(ans, field, value)

    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating

    series, series_index = read_series(root, prefixes, refines)
    if series:
        ans.series, ans.series_index = series, series_index

    ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories

    user_metadata = read_user_metadata(root, prefixes, refines) or {}
    for name, fm in iteritems(user_metadata):
        ans.set_user_metadata(name, fm)

    if return_extra_data:
        return (ans, ver, read_raster_cover(root, prefixes, refines),
                first_spine_item(root, prefixes, refines))
    return ans
def _get_bookdetails(self, url):
    """Download one book page and turn it into a ``Metadata`` object.

    :param url: mapping with ``"url"`` (appended to ``self.BASE_URL``) and
        ``"title"``.
    :return: the populated ``Metadata``.  Publisher, publication date and
        page count are set on a best-effort basis; a failure there is
        printed and otherwise ignored.  When both a cover URL and an ISBN
        were found the cover URL is cached under the ISBN.
    """
    u = self.BASE_URL + url["url"]
    print("_get_bookdetails:: traukiam knygą iš %s" % u)
    resp = urllib2.urlopen(u)
    try:
        contents = resp.read()
    finally:
        # Release the connection even when reading fails.
        resp.close()
    tree = etree.HTML(contents)

    authors = self._get_authors(tree)
    publisher = self._get_details(tree, self.details_publisher)
    year = self._get_year(tree)
    pages = self._get_details(tree, self.details_pages)
    isbn = self._get_details(tree, self.details_isbn)
    description = self._get_description(tree)
    cover = self._get_cover_url(tree)
    tags = self._get_tags(tree)

    mi = Metadata(url["title"], authors)
    mi.set_identifier("isbn", isbn)
    mi.comments = description
    mi.language = "LT"
    mi.tags = tags

    try:
        mi.set("publisher", publisher)
    except Exception:
        print(u"_get_bookdetails:: nepavyko užsetinti leidėjo")

    try:
        # Only the year is known; month and day are fixed to January 2nd.
        mi.set("pubdate", datetime.datetime(year, 1, 2))
    except Exception:
        print(u"_get_bookdetails:: nepavyko užsetinti leidimo datos")

    try:
        if self.gui:
            print("YYYYRAAA GUI!!!")
            # The page count goes into the "#count" user metadata column.
            col = {}
            col["#value#"] = pages
            mi.set_user_metadata("#count", col)
    except Exception:
        print(u"_get_bookdetails:: nepavyko užsetinti puslapių skaičiaus")

    if cover and isbn:
        print(u"_get_bookdetails:: kešuojam viršelį:", cover)
        self.cache_isbn_to_identifier(isbn, isbn)
        self.cache_identifier_to_cover_url(isbn, cover)
        mi.has_cover = True
        print(self.cached_identifier_to_cover_url(isbn))
    return mi
def read_metadata(root, ver=None, return_extra_data=False):
    """Populate a ``Metadata`` object from the package document ``root``.

    The object starts with 'Unknown' title and author; each field is
    replaced only when its reader yields a truthy value (for dates: a value
    that is not undefined).

    :return: the ``Metadata`` object, or, when ``return_extra_data`` is
        true, the tuple ``(metadata, ver, raster cover, first spine item)``.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)

    def read(reader):
        # Every reader takes the same three arguments.
        return reader(root, prefixes, refines)

    # 'calibre' and 'uuid' identifiers map to dedicated attributes; every
    # other scheme goes into the identifiers dict (first value only).
    plain_ids = {}
    for key, vals in read(read_identifiers).iteritems():
        head = vals[0]
        if key == 'calibre':
            ans.application_id = head
        elif key == 'uuid':
            ans.uuid = head
        else:
            plain_ids[key] = head
    ans.set_identifiers(plain_ids)

    ans.title = read(read_title) or ans.title
    ans.title_sort = read(read_title_sort) or ans.title_sort
    ans.languages = read(read_languages) or ans.languages

    author_names, author_sorts = [], []
    for entry in read(read_authors):
        author_names.append(entry.name)
        author_sorts.append(entry.sort)
    ans.authors = author_names or ans.authors
    ans.author_sort = authors_to_string(author_sorts) or ans.author_sort

    bkp = read(read_book_producers)
    if bkp and bkp[0]:
        ans.book_producer = bkp[0]

    # Dates are copied only when the reader returned a defined date.
    published = read(read_pubdate)
    if not is_date_undefined(published):
        ans.pubdate = published
    stamped = read(read_timestamp)
    if not is_date_undefined(stamped):
        ans.timestamp = stamped
    modified = read(read_last_modified)
    if not is_date_undefined(modified):
        ans.last_modified = modified

    ans.comments = read(read_comments) or ans.comments
    ans.publisher = read(read_publisher) or ans.publisher
    ans.tags = read(read_tags) or ans.tags
    ans.rating = read(read_rating) or ans.rating

    series_name, series_pos = read(read_series)
    if series_name:
        ans.series, ans.series_index = series_name, series_pos

    ans.author_link_map = read(read_author_link_map) or ans.author_link_map
    ans.user_categories = read(read_user_categories) or ans.user_categories

    for name, fm in (read(read_user_metadata) or {}).iteritems():
        ans.set_user_metadata(name, fm)

    if not return_extra_data:
        return ans
    return ans, ver, read(read_raster_cover), read(first_spine_item)
def sample_results(self):
    """Return two hand-built ``Metadata`` results used as sample data.

    The first is fully populated; the second has an extra long title and
    only the language, publication date and publisher set.
    """
    full = Metadata("The Great Gatsby", ["Francis Scott Fitzgerald"])
    sparse = Metadata(
        "The Great Gatsby - An extra long title to test resizing",
        ["F. Scott Fitzgerald"],
    )

    full.has_cached_cover_url = True
    full.comments = "Some comments " * 10
    full.tags = ["tag%d" % n for n in range(20)]
    full.rating = 4.4
    full.language = "en"
    full.pubdate = utcnow()
    full.publisher = "Publisher 1"

    sparse.has_cached_cover_url = False
    sparse.language = "fr"
    sparse.pubdate = fromordinal(1000000)
    sparse.publisher = "Publisher 2"

    return [full, sparse]
def sample_results(self):
    """Build the two sample ``Metadata`` results.

    The second result carries an extra long title; only the first has
    comments, tags and a rating.
    """
    first = Metadata('The Great Gatsby', ['Francis Scott Fitzgerald'])
    second = Metadata('The Great Gatsby - An extra long title to test resizing',
                      ['F. Scott Fitzgerald'])

    # Fields set on both results.
    first.has_cached_cover_url, second.has_cached_cover_url = True, False
    first.language, second.language = 'en', 'fr'
    first.pubdate = utcnow()
    second.pubdate = fromordinal(1000000)
    first.publisher, second.publisher = 'Publisher 1', 'Publisher 2'

    # Fields only the first result carries.
    first.comments = 'Some comments '*10
    first.tags = ['tag%d'%idx for idx in range(20)]
    first.rating = 4.4

    return [first, second]
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id.

    The issue is looked up through ``PyComicvineWrapper``; ``None`` is
    returned when the lookup result is falsy.
    """
    issue = PyComicvineWrapper(log).lookup_issue(issue_id)
    if not issue:
        return None

    record = Metadata(issue.get_full_title(), issue.get_authors())
    record.series = issue.volume_name
    record.series_index = issue.issue_number
    record.set_identifier('comicvine', str(issue.id))
    record.set_identifier('comicvine-volume', str(issue.volume_id))
    record.comments = issue.description
    record.has_cover = False
    record.publisher = issue.publisher_name
    record.pubdate = issue.date
    return record
def sample_results(self):
    """Return two sample ``Metadata`` results sharing the same title.

    Only the first result carries comments, tags and a rating.
    """
    rich = Metadata('The Great Gatsby', ['Francis Scott Fitzgerald'])
    plain = Metadata('The Great Gatsby', ['F. Scott Fitzgerald'])

    rich.has_cached_cover_url = True
    rich.comments = 'Some comments '*10
    rich.tags = ['tag%d'%num for num in range(20)]
    rich.rating = 4.4
    rich.language = 'en'
    rich.pubdate = utcnow()
    rich.publisher = 'Publisher 1'

    plain.has_cached_cover_url = False
    plain.language = 'fr'
    plain.pubdate = fromordinal(1000000)
    plain.publisher = 'Publisher 2'

    return [rich, plain]
def parse_response(cls, response, isbn_initial, log):
    """Turn a search results page into a list of ``Metadata`` objects.

    :param response: object whose ``text`` attribute holds the page HTML.
    :param isbn_initial: ISBN used when a candidate carries none of its own.
    :param log: logger; one info line is written per candidate.
    :return: list with one ``Metadata`` per ``b-result`` candidate.
    """
    metadata_items = []
    page_soup = BeautifulSoup(response.text)
    for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):
        title = cls.find(candidate, 'b-result__name-wrap', True)
        author = map(
            unicode.strip,
            cls.find(candidate, 'b-result__author', True).split(','))
        comments = cls.find(
            candidate, 'b-result__desc__full', True
        ).replace(u'Скрыть', '').strip()
        # Text after the last ':' up to the first ','.
        isbn = cls.find(
            candidate, 'b-result__isbn', True
        ).split(':')[-1].split(',')[0].strip()
        log.info(u'Found candidate %s: %s' % (idx, title))

        publisher = None
        pubdate = None
        other_info = cls.find(candidate, 'b-result__years', True).strip()
        if other_info:
            # ';' separated "key: value" pairs.
            for entry in other_info.split(';'):
                k, sep, v = entry.partition(':')
                if not sep:
                    # A piece without ':' (e.g. the empty piece after a
                    # trailing ';') carries no value; skip it instead of
                    # failing the whole page on tuple unpacking.
                    continue
                k = k.strip()
                if k == u'Год':
                    # Only the year is known; use January 1st of that year.
                    pubdate = parse_only_date(
                        '1.1.%s' % v.split(',')[0].strip())
                elif k == u'Издательство':
                    publisher = v.strip()

        metadata_item = Metadata(title, author)
        metadata_item.isbn = isbn or isbn_initial
        if comments:
            metadata_item.comments = comments
        if publisher is not None:
            metadata_item.publisher = publisher
        if pubdate is not None:
            metadata_item.pubdate = pubdate
        metadata_items.append(metadata_item)
    return metadata_items
def test_input_comment_single(self):
    """Metadata read from the 'comment_single' stream matches the canon."""
    parsed = get_metadata(self.get_stream('comment_single'))

    expected = Metadata('A Comment Tag & Title Ⓒ',
                        ['James Madison', 'James Monroe'])
    expected.publisher = 'Publisher C'
    expected.languages = ['French']
    expected.pubdate = parse_date('2015-01-01')
    expected.timestamp = parse_date('2014-01-01')
    expected.series = 'Comment Series'
    expected.series_index = 3.0
    expected.rating = 0.0
    expected.comments = 'comment "comments" ♥ HTML -- too &amp;'
    expected.tags = ['tag d']
    identifiers = {
        'isbn': '3456789012',
        'url': 'http://google.com/search?q=calibre',
    }
    expected.set_identifiers(identifiers)

    self.compare_metadata(parsed, expected)
def data2mi(self, item):
    """Convert a single metadata answer (a dict) into a ``Metadata`` object.

    Later keys win over earlier ones: 'year' overrides the publication date
    taken from 'updated', and 'journal' overrides the series taken from
    'series'.
    """
    mi = Metadata(_('Unknown'))

    # Regular metadata
    mi.title = item.get('title', None)
    mi.authors = item.get('authors', [])
    mi.publisher = item.get('publisher', None)

    # Identifiers
    if 'id' in item:
        mi.set_identifier(self.idkey, item['id'])
    for scheme in ('doi', 'isbn'):
        if scheme in item:
            mi.set_identifier(scheme, item[scheme])

    if 'updated' in item:
        mi.pubdate = parse_date(item['updated'], assume_utc=True)
    if 'series' in item:
        mi.series = item['series']
        mi.series_index = self.format_series_index(
            item.get('series_index'), None)
    if 'year' in item:
        mi.pubdate = parse_date(item['year'], assume_utc=True)
    if 'abstract' in item:
        mi.comments = self.format_abstract(item['abstract'])
    if 'language' in item:
        mi.language = item['language']
    if 'journal' in item:
        mi.series = item['journal']
        mi.series_index = self.format_series_index(
            item.get('volume'), item.get('number'))

    if 'subject' in item:
        # Tags are the sorted union of both tag sources over all subjects.
        collected = set()
        for subject in item['subject']:
            collected.update(msc_tags(subject))
            collected.update(arxiv_tags(subject))
        mi.tags = sorted(collected)
    return mi
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    """Convert one Douban result entry (a dict) into a ``Metadata`` object.

    :param browser: not used by this function.
    :param log: logger; ``error`` is called when the pubdate cannot be parsed.
    :param entry_: dict describing one book.
    :param timeout: not used by this function.
    :return: the populated ``Metadata``.  The rating is the entry's average
        divided by two; ``has_douban_cover`` holds the entry's image value.
    """
    # Only the date helpers are needed; the other function-scope imports
    # and the 'url'/'subtitle' locals were never used and were dropped.
    from calibre.utils.date import parse_date, utcnow

    douban_id = entry_['id']
    title_ = entry_['title']
    authors = [x.strip() for x in entry_['author'] if x]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    mi.comments = entry_['summary']
    mi.publisher = entry_['publisher']

    # ISBN
    mi.isbn = entry_['isbn10']
    mi.all_isbns = [entry_['isbn10'], entry_['isbn13']]

    # Tags
    mi.tags = [x['name'].strip() for x in entry_['tags']]

    # pubdate
    pubdate = entry_['pubdate']
    if pubdate:
        try:
            # Missing date parts are filled from today's date with day 15.
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    mi.rating = float(entry_['rating']['average']) / 2.0

    # Cover
    mi.has_douban_cover = entry_['image']
    return mi
def parse(self, raw):
    """Build a ``Metadata`` object from ``self.basic_data`` and a detail page.

    :param raw: HTML of the detail page; only the comment sections are read
        from it, everything else comes from ``self.basic_data``.
    :return: the populated ``Metadata``.  ``has_cover`` tells whether the
        plugin has a cached cover URL for ``self.sku``.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import UNDEFINED_DATE

    root = parse_html(raw)
    mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

    # Identifiers
    if self.basic_data['isbns']:
        mi.isbn = self.basic_data['isbns'][0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    if self.basic_data['tags']:
        # A leading '&' is stripped from a tag.
        mi.tags = [
            t[1:].strip() if t.startswith('&') else t
            for t in self.basic_data['tags']
        ]

    # Publisher
    mi.publisher = self.basic_data['publisher']

    # Pubdate
    pubdate = self.basic_data['pubdate']
    # Compare year with year: comparing the int year against the
    # UNDEFINED_DATE datetime itself is always unequal.
    if pubdate and pubdate.year != UNDEFINED_DATE.year:
        mi.pubdate = pubdate

    # Rating
    if self.basic_data['rating']:
        mi.rating = self.basic_data['rating']

    # Comments
    comments = ''
    for cid in ('summary', 'contributorbio', 'quotes_reviews'):
        cid = 'desc_{}{}-content'.format(cid, self.sku)
        div = root.xpath('//*[@id="{}"]'.format(cid))
        if div:
            comments += self.render_comments(div[0])
    if comments:
        mi.comments = comments

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(
        self.sku) is not None
    return mi
def test_input_meta_multi(self):
    """Metadata read from the 'meta_multi' stream matches the canon."""
    parsed = get_metadata(self.get_stream('meta_multi'))

    expected = Metadata(
        'A Meta Tag & Title Ⓒ',
        ['George Washington', 'John Adams', 'Thomas Jefferson'])
    expected.publisher = 'Publisher A'
    expected.languages = ['English', 'Spanish']
    expected.pubdate = parse_date('2019-01-01')
    expected.timestamp = parse_date('2018-01-01')
    expected.series = 'Meta Series'
    expected.series_index = 1.0
    expected.rating = 8.0
    expected.comments = 'meta "comments" ♥ HTML &amp;'
    expected.tags = ['tag a', 'tag b', 'tag c']
    identifiers = {
        'isbn': '1234567890',
        'url': 'http://google.com/search?q=calibre',
    }
    expected.set_identifiers(identifiers)

    self.compare_metadata(parsed, expected)
def parse_response(cls, response, isbn_initial, log):
    """Parse a search results page into ``Metadata`` objects.

    :param response: object whose ``text`` attribute holds the page HTML.
    :param isbn_initial: ISBN used when a candidate carries none of its own.
    :param log: logger; one info line is written per candidate.
    :return: list with one ``Metadata`` per ``b-result`` candidate.
    """
    metadata_items = []
    page_soup = BeautifulSoup(response.text)
    for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):
        title = cls.find(candidate, 'b-result__name-wrap', True)
        author_text = cls.find(candidate, 'b-result__author', True)
        author = map(unicode.strip, author_text.split(','))
        comments = cls.find(candidate, 'b-result__desc__full', True)
        comments = comments.replace(u'Скрыть', '').strip()
        # Text after the last ':' up to the first ','.
        isbn_text = cls.find(candidate, 'b-result__isbn', True)
        isbn = isbn_text.split(':')[-1].split(',')[0].strip()
        log.info(u'Found candidate %s: %s' % (idx, title))

        publisher = None
        pubdate = None
        other_info = cls.find(candidate, 'b-result__years', True).strip()
        # ';' separated "key: value" pairs (nothing to iterate when empty).
        for entry in (other_info.split(';') if other_info else ()):
            k, sep, v = entry.partition(':')
            if not sep:
                # A piece without ':' (e.g. the empty piece after a trailing
                # ';') carries no value; skip it instead of failing the
                # whole page on tuple unpacking.
                continue
            k = k.strip()
            if k == u'Год':
                # Only the year is known; use January 1st of that year.
                pubdate = parse_only_date('1.1.%s' % v.split(',')[0].strip())
            elif k == u'Издательство':
                publisher = v.strip()

        metadata_item = Metadata(title, author)
        metadata_item.isbn = isbn or isbn_initial
        if comments:
            metadata_item.comments = comments
        if publisher is not None:
            metadata_item.publisher = publisher
        if pubdate is not None:
            metadata_item.pubdate = pubdate
        metadata_items.append(metadata_item)
    return metadata_items
def parse(self, xml_detail):
    """Assemble a ``Metadata`` object from a single detail document.

    All ``parse_*`` helpers run first; ``None`` is returned when either the
    title or the authors are ``None``.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)

    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: self.ident}
    mi.rating = rating
    mi.tags = tags
    mi.series = serie
    mi.series_index = serie_index
    return mi
def data2mi(self, item):
    """Build a ``Metadata`` object from one metadata answer given as a dict.

    Keys are applied in a fixed order, so 'year' overrides the publication
    date taken from 'updated' and 'journal' overrides the series taken from
    'series'.
    """
    mi = Metadata(_('Unknown'))

    # Regular metadata
    mi.title = item.get('title', None)
    mi.authors = item.get('authors', [])
    mi.publisher = item.get('publisher', None)

    present = item.__contains__

    if present('id'):
        mi.set_identifier(self.idkey, item['id'])
    if present('doi'):
        mi.set_identifier('doi', item['doi'])
    if present('isbn'):
        mi.set_identifier('isbn', item['isbn'])
    if present('updated'):
        mi.pubdate = parse_date(item['updated'], assume_utc=True)
    if present('series'):
        mi.series = item['series']
        mi.series_index = self.format_series_index(item.get('series_index'), None)
    if present('year'):
        mi.pubdate = parse_date(item['year'], assume_utc=True)
    if present('abstract'):
        mi.comments = self.format_abstract(item['abstract'])
    if present('language'):
        mi.language = item['language']
    if present('journal'):
        volume, number = item.get('volume'), item.get('number')
        mi.series = item['journal']
        mi.series_index = self.format_series_index(volume, number)
    if present('subject'):
        # Tags are the sorted union of both tag sources over all subjects.
        tag_pool = set()
        for entry in item['subject']:
            tag_pool.update(msc_tags(entry))
            tag_pool.update(arxiv_tags(entry))
        mi.tags = sorted(tag_pool)
    return mi
def parse(self, raw):
    """Combine ``self.basic_data`` with a detail page into ``Metadata``.

    :param raw: HTML of the detail page; only the comment sections are read
        from it, everything else comes from ``self.basic_data``.
    :return: the populated ``Metadata``.  ``has_cover`` tells whether the
        plugin has a cached cover URL for ``self.sku``.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import UNDEFINED_DATE

    basic = self.basic_data
    root = parse_html(raw)
    mi = Metadata(basic['title'], basic['authors'])

    # Identifiers
    if basic['isbns']:
        mi.isbn = basic['isbns'][0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    if basic['tags']:
        # A leading '&' is stripped from a tag.
        mi.tags = [t[1:].strip() if t.startswith('&') else t
                   for t in basic['tags']]

    # Publisher
    mi.publisher = basic['publisher']

    # Pubdate
    # Compare year with year: comparing the int year against the
    # UNDEFINED_DATE datetime itself is always unequal.
    if basic['pubdate'] and basic['pubdate'].year != UNDEFINED_DATE.year:
        mi.pubdate = basic['pubdate']

    # Rating
    if basic['rating']:
        mi.rating = basic['rating']

    # Comments
    comments = ''
    for cid in ('summary', 'contributorbio', 'quotes_reviews'):
        cid = 'desc_{}{}-content'.format(cid, self.sku)
        div = root.xpath('//*[@id="{}"]'.format(cid))
        if div:
            comments += self.render_comments(div[0])
    if comments:
        mi.comments = comments

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def _metadata(self, baike):
    """Convert a Baidu Baike page object into a calibre ``Metadata`` object.

    :param baike: page object providing ``get_info``, ``get_tags``,
        ``get_image``, ``get_summary``, ``get_id`` and ``http.url``.
    :return: the populated ``Metadata``.  The cover image is downloaded into
        ``cover_data`` only when ``self.copy_image`` is set and a cover URL
        exists.  ``pubdate`` defaults to the current time and is replaced by
        the date found in the serial-status field when that field marks the
        work as finished and contains a ``Y-M-D`` date.
    """
    from calibre.ebooks.metadata.book.base import Metadata

    info = baike.get_info()
    logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    # Unicode default, like every other literal here: on Python 2 a
    # byte-string default would make the unicode .replace() below raise
    # UnicodeDecodeError.
    plat = u"网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[n]" marker from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    mi.provider_key = KEY
    mi.provider_value = baike.get_id()

    if self.copy_image and mi.cover_url:
        logging.debug("fetching cover: %s", mi.cover_url)
        img = io.BytesIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    status = info.get(u'连载状态', "")
    if u'完结' in status:
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No usable date in the status text; keep the default pubdate.
            pass
    return mi
def build_meta(log, issue_id):
    '''Build metadata record based on comicvine issue_id.

    The issue is fetched through ``pycomicvine.Issue`` with only the fields
    used below.

    :param log: logger; ``warn`` is called when the issue cannot be used.
    :param issue_id: comicvine issue identifier.
    :return: a ``Metadata`` object carrying both the issue and the volume
        identifier, or ``None`` when the issue or its volume is falsy.
    '''
    wanted_fields = [
        'id', 'name', 'volume', 'issue_number', 'person_credits',
        'description', 'store_date', 'cover_date']
    issue = pycomicvine.Issue(issue_id, field_list=wanted_fields)
    if not issue or not issue.volume:
        # %s instead of %d: %d raises TypeError when the id is a string,
        # which would turn this failure path into a crash.
        log.warn('Unable to load Issue(%s)' % issue_id)
        return None

    title = '%s #%s' % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = '%s: %s' % (title, issue.name)
    authors = [p.name for p in issue.person_credits]

    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier('comicvine', str(issue.id))
    meta.set_identifier('comicvine-volume', str(issue.volume.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    # Prefer the store date, fall back to the cover date.
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
def _metadata(self, baike):
    """Convert a Baidu Baike page object into a calibre ``Metadata`` object.

    This variant reads the info fields under their Traditional Chinese keys.

    :param baike: page object providing ``get_info``, ``get_tags``,
        ``get_image``, ``get_summary`` and ``http.url``.
    :return: the populated ``Metadata``.  The cover image is downloaded into
        ``cover_data`` only when ``self.copy_image`` is set and a cover URL
        exists.  ``pubdate`` defaults to the current time and is replaced by
        the date found in the serial-status field when that field marks the
        work as finished and contains a ``Y-M-D`` date.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    # Unicode default: on Python 2 a byte-string default would make the
    # unicode .replace() below raise UnicodeDecodeError.
    plat = u"網絡小說平台"
    plat = info.get(u'首發狀態', plat)
    plat = info.get(u'首發網站', plat)
    plat = plat.replace(u'首發', '')
    mi.publisher = info.get(u'連載平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing "[n]" marker from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'

    # Without a cover URL there is nothing to download.
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)

    status = info.get(u'連載狀態', "")
    if u'完結' in status:
        day = re.findall(r'\d*-\d*-\d*', status)
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except (IndexError, ValueError):
            # No usable date in the status text; keep the default pubdate.
            pass
    return mi
def parse_details(self, root):
    """Parse an ISFDB publication page and queue the resulting metadata.

    Reads the ISFDB id from the trailing digits of ``self.url`` and the
    title, authors/editors, ISBN, publisher and date from the detail list
    items under ``root``. If title, authors or id are missing, an error is
    logged and nothing is queued. Otherwise a ``Metadata`` object is put on
    ``self.result_queue``.

    Each field is parsed on a best-effort basis: a failure is logged and the
    remaining fields are still processed.

    :param root: parsed HTML tree of the publication page (must support
        ``xpath``).
    """
    isfdb_id = None
    title = None
    authors = []
    isbn = None
    publisher = None
    pubdate = None

    try:
        isfdb_id = re.search(r'(\d+)$', self.url).group(1)
    except Exception:
        self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)

    detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
    if not detail_nodes:
        # no table (on records with no image)
        detail_nodes = root.xpath('//div[@id="content"]/div/ul/li')

    for detail_node in detail_nodes:
        # The first child element holds the section label, e.g. "Publication:".
        section = detail_node[0].text_content().strip().rstrip(':')
        try:
            if section == 'Publication':
                title = detail_node[0].tail.strip()
                if not title:
                    # assume an extra span with a transliterated title tooltip
                    title = detail_node[1].text_content().strip()
            elif section in ('Authors', 'Editors'):
                for a in detail_node.xpath('.//a'):
                    author = a.text_content().strip()
                    if section == 'Editors':
                        authors.append(author + ' (Editor)')
                    else:
                        authors.append(author)
            elif section == 'ISBN':
                isbn = detail_node[0].tail.strip('[] \n')
            elif section == 'Publisher':
                publisher = detail_node.xpath('a')[0].text_content().strip()
            elif section == 'Date':
                pubdate = self._convert_date_text(detail_node[0].tail.strip())
        except Exception:
            self.log.exception('Error parsing section %r for url: %r' % (section, self.url))

    if not title or not authors or not isfdb_id:
        self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
        self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('isfdb', isfdb_id)
    self.isfdb_id = isfdb_id

    if isbn:
        self.isbn = mi.isbn = isbn
    if publisher:
        mi.publisher = publisher
    if pubdate:
        mi.pubdate = pubdate

    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except Exception:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    # NOTE(review): self.cover_url and self.isbn are read below even when the
    # assignments above did not run -- presumably initialised elsewhere; confirm.
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!

    mi.source_relevance = self.relevance

    if self.isfdb_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    """Parse an Amazon product page and queue the resulting metadata.

    ASIN, title and authors are required: if any is missing an error is
    logged and nothing is queued. Every other field (rating, comments,
    series, tags, cover, ISBN, publisher, pubdate, language) is parsed on a
    best-effort basis -- a failure is logged and the remaining fields are
    still processed. The finished ``Metadata`` object is put on
    ``self.result_queue``.

    :param raw: raw page content; written to a temp file when
        ``self.testing`` is set and passed on to ``parse_cover``.
    :param root: parsed HTML tree of the page.
    """
    try:
        asin = self.parse_asin(root)
    except Exception:
        self.log.exception('Error parsing asin for url: %r'%self.url)
        asin = None

    if self.testing:
        # Keep a copy of the downloaded page around for debugging.
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except Exception:
        self.log.exception('Error parsing title for url: %r'%self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r'%self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, authors))
        return

    mi = Metadata(title, authors)
    # The identifier type carries the store domain, except for the .com store.
    idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_rating(root)
    except Exception:
        self.log.exception('Error parsing ratings for url: %r'%self.url)

    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except Exception:
        self.log.exception('Error parsing series for url: %r'%self.url)

    try:
        mi.tags = self.parse_tags(root)
    except Exception:
        self.log.exception('Error parsing tags for url: %r'%self.url)

    try:
        self.cover_url = self.parse_cover(root, raw)
    except Exception:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    # NOTE(review): self.cover_url and self.isbn are read below even when the
    # assignments did not run -- presumably initialised elsewhere; confirm.
    mi.has_cover = bool(self.cover_url)

    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except Exception:
            self.log.exception('Failed to parse new-style book details section')
    else:
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]

            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except Exception:
                self.log.exception('Error parsing ISBN for url: %r'%self.url)

            try:
                mi.publisher = self.parse_publisher(pd)
            except Exception:
                self.log.exception('Error parsing publisher for url: %r'%self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd)
            except Exception:
                self.log.exception('Error parsing publish date for url: %r'%self.url)

            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except Exception:
                self.log.exception('Error parsing language for url: %r'%self.url)
        else:
            self.log.warning('Failed to find product description for url: %r'%self.url)

    mi.source_relevance = self.relevance

    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                    self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    """Parse every metadata field of a 17k book page and queue the result.

    Book id, title and authors are required: if any is missing an error is
    logged and nothing is queued. Comments, series, tags and cover are
    parsed on a best-effort basis -- a failure is logged and the remaining
    fields are still processed. The finished ``Metadata`` object is put on
    ``self.result_queue``.

    :param raw: raw page content; written to a temp file when
        ``self.testing`` is set and passed on to ``parse_cover``.
    :param root: parsed HTML tree of the page.
    """
    try:
        asin = self.parse_asin(root)
    except Exception:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None

    if self.testing:
        # Keep a copy of the downloaded page around for debugging.
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(
                prefix=(asin or str(uuid.uuid4())) + '_',
                suffix='.html',
                delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    # Parse the book title
    try:
        title = self.parse_title(root)
    except Exception:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    # Parse the authors
    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    # Create the metadata object from title and authors; the remaining
    # fields are set on it below.
    mi = Metadata(title, authors)

    # Set the book id
    idtype = '17k'
    mi.set_identifier(idtype, asin)
    self.k17k_id = asin

    # Set the comments (book summary)
    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    # Set the series
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except Exception:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Set the tags
    try:
        mi.tags = self.parse_tags(root)
    except Exception:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Set the last-modified date (currently disabled)
    # try:
    #     mi.last_modified = self.parse_last_modified(root)
    # except:
    #     self.log.exception('Error parsing last_modified for url: %r'%self.url)

    # Set the cover
    try:
        self.cover_url = self.parse_cover(root, raw)
    except Exception:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    # NOTE(review): self.cover_url and self.isbn are read below even when no
    # assignment ran in this method -- presumably initialised elsewhere; confirm.
    mi.has_cover = bool(self.cover_url)

    mi.source_relevance = self.relevance
    mi.languages = [
        u'中文',
    ]

    if self.k17k_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(
                self.k17k_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
#!/usr/bin/env python
def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers):
    """Convert an ISBNDb XML response into a list of ``Metadata`` objects.

    A ``BookData`` entry is skipped when it has no valid ISBN, does not match
    the ISBN in ``identifiers`` (if one was given), has no title or authors,
    was already seen, does not match the query tokens, or has a publisher
    whose name contains "audio".

    :param feed: parsed XML root of the response.
    :param seen: set of ``(title, authors_tuple)`` keys already processed;
        updated in place.
    :param orig_title: title used for the query (source of title tokens).
    :param orig_authors: authors used for the query (source of author tokens).
    :param identifiers: mapping that may contain an ``'isbn'`` key.
    :return: ``(total_results, shown_results, results)``.
    :raises ValueError: if the response has no ``BookList`` element.
    """
    from lxml import etree

    def tostring(x):
        # Text content of an element, or '' when the element is missing.
        if x is None:
            return ''
        return etree.tostring(x, method='text', encoding=unicode).strip()

    orig_isbn = identifiers.get('isbn', None)
    title_tokens = list(self.get_title_tokens(orig_title))
    author_tokens = list(self.get_author_tokens(orig_authors))
    results = []

    def ismatch(title, authors):
        # A side with no tokens matches everything; otherwise a single
        # token appearing in the candidate text is enough.
        authors = lower(' '.join(authors))
        title = lower(title)
        match = not title_tokens or any(
            lower(t) in title for t in title_tokens)
        amatch = not author_tokens or any(
            lower(a) in authors for a in author_tokens)
        return match and amatch

    bl = feed.find('BookList')
    if bl is None:
        err = tostring(feed.find('errormessage'))
        raise ValueError('ISBNDb query failed:' + err)
    total_results = int(bl.get('total_results'))
    shown_results = int(bl.get('shown_results'))

    for bd in bl.xpath('.//BookData'):
        isbn = check_isbn(bd.get('isbn', None))
        isbn13 = check_isbn(bd.get('isbn13', None))
        if not isbn and not isbn13:
            continue
        if orig_isbn and orig_isbn not in {isbn, isbn13}:
            continue

        title = tostring(bd.find('Title'))
        if not title:
            continue

        authors = []
        for au in bd.xpath('.//Authors/Person'):
            au = tostring(au)
            if au:
                if ',' in au:
                    # "Last, First" -> "First Last"
                    ln, _, fn = au.partition(',')
                    au = fn.strip() + ' ' + ln.strip()
                authors.append(au)
        if not authors:
            continue

        comments = tostring(bd.find('Summary'))

        id_ = (title, tuple(authors))
        if id_ in seen:
            continue
        seen.add(id_)
        if not ismatch(title, authors):
            continue

        publisher = tostring(bd.find('PublisherText'))
        if not publisher:
            publisher = None
        if publisher and 'audio' in publisher.lower():
            continue

        mi = Metadata(title, authors)
        # An entry can be accepted on its isbn13 alone; fall back to it so
        # the record does not end up without any ISBN.
        mi.isbn = isbn or isbn13
        mi.publisher = publisher
        mi.comments = comments
        results.append(mi)

    return total_results, shown_results, results
def parse_details(self, root):
    """Parse a Kyobobook product page and queue the resulting metadata.

    Kyobobook id, title and authors are required: if any is missing an
    error is logged and nothing is queued. ISBN, rating, comments, cover,
    tags, publisher/date and language are parsed on a best-effort basis --
    a failure is logged and the remaining fields are still processed. The
    finished ``Metadata`` object is put on ``self.result_queue``.

    :param root: parsed HTML tree of the page.
    """
    try:
        kyobobook_id = self.parse_kyobobook_id(self.url)
    except Exception:
        self.log.exception('Error parsing Kyobobook id for url: %r'%self.url)
        kyobobook_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except Exception:
        self.log.exception('Error parsing title and series for url: %r'%self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []

    if not title or not authors or not kyobobook_id:
        self.log.error('Could not find title/authors/kyobobook id for %r'%self.url)
        self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id,
            title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('kyobobook', kyobobook_id)
    self.kyobobook_id = kyobobook_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except Exception:
        self.log.exception('Error parsing ISBN for url: %r'%self.url)

    try:
        mi.rating = self.parse_rating(root)
    except Exception:
        self.log.exception('Error parsing ratings for url: %r'%self.url)

    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except Exception:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    # NOTE(review): self.cover_url and self.isbn are read below even when the
    # assignments did not run -- presumably initialised elsewhere; confirm.
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except Exception:
        self.log.exception('Error parsing tags for url: %r'%self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except Exception:
        self.log.exception('Error parsing publisher and date for url: %r'%self.url)

    try:
        lang = self._parse_language(root)
        if lang:
            mi.language = lang
    except Exception:
        self.log.exception('Error parsing language for url: %r'%self.url)

    mi.source_relevance = self.relevance

    if self.kyobobook_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
                    self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    """Assemble a ``Metadata`` object for ``book_id`` from the field caches.

    Standard fields are read through ``self._field_for``; custom columns are
    added afterwards, with composite columns evaluated last so they see the
    other values. When ``get_user_categories`` is true, the user-category
    membership of the book is computed from the ``user_categories`` pref;
    otherwise ``mi.user_categories`` is an empty dict.
    """
    field = self._field_for
    mi = Metadata(None, template_cache=self.formatter_template_cache)

    # Author names, plus name -> sort and name -> link maps.
    author_recs = [self._author_data(aid)
                   for aid in self._field_ids_for('authors', book_id)]
    author_names = [rec['name'] for rec in author_recs]
    sort_by_name = {rec['name']: rec['sort'] for rec in author_recs}
    link_by_name = {rec['name']: rec['link'] for rec in author_recs}

    mi.title = field('title', book_id, default_value=_('Unknown'))
    mi.authors = author_names
    mi.author_sort = field('author_sort', book_id,
                           default_value=_('Unknown'))
    mi.author_sort_map = sort_by_name
    mi.author_link_map = link_by_name
    mi.comments = field('comments', book_id)
    mi.publisher = field('publisher', book_id)

    # One "now" value shared by all date defaults.
    now = nowf()
    mi.timestamp = field('timestamp', book_id, default_value=now)
    mi.pubdate = field('pubdate', book_id, default_value=now)
    mi.uuid = field('uuid', book_id, default_value='dummy')
    mi.title_sort = field('sort', book_id, default_value=_('Unknown'))
    mi.book_size = field('size', book_id, default_value=0)
    mi.ondevice_col = field('ondevice', book_id, default_value='')
    mi.last_modified = field('last_modified', book_id, default_value=now)

    formats = field('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(field('languages', book_id))
    if formats:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        mi.formats = FormatsList(formats, mi.format_metadata)
    else:
        mi.formats = None

    has_cover = field('cover', book_id, default_value=False)
    mi.has_cover = _('Yes') if has_cover else ''
    mi.tags = list(field('tags', book_id, default_value=()))
    mi.series = field('series', book_id)
    if mi.series:
        mi.series_index = field('series_index', book_id, default_value=1.0)
    mi.rating = field('rating', book_id)
    mi.set_identifiers(field('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id

    # Custom columns: plain values are set right away, composite columns
    # are collected and evaluated once everything else is in place.
    deferred = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            deferred.append(key)
            continue
        value = field(key, book_id)
        if isinstance(value, tuple):
            value = list(value)
        index_value = field(key+'_index', book_id)
        mi.set(key, val=value, extra=index_value)
    for key in deferred:
        mi.set(key, val=self._composite_for(key, book_id, mi))

    membership = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            hits = []
            for name, cat, ign in user_cats[ucat]:
                current = mi.get(cat, None)
                # Multi-valued fields match by membership, single-valued
                # fields by equality.
                if isinstance(current, list):
                    matched = name in current
                else:
                    matched = name == current
                if matched:
                    hits.append([name, cat])
            membership[ucat] = hits
    mi.user_categories = membership

    return mi
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Convert a Douban Atom ``entry_`` element into a ``Metadata`` object.

    Title, authors and the Douban id come from the search entry itself. The
    details feed for the book is then fetched; if that fails, the partially
    filled object is returned. Otherwise comments, publisher, ISBNs, tags,
    pubdate, rating and cover URL are read from the details entry.

    :param browser: browser object passed through to ``get_details``.
    :param log: logger used for error reporting.
    :param entry_: Atom entry element from the search feed.
    :param timeout: timeout passed through to ``get_details``.
    :return: the ``Metadata`` object, or ``None`` if the entry has no id URL
        or no title.
    """
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        # Stripped text of the first node matched by x, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # Test the extracted title text: ``title`` is the compiled XPath object
    # and is always truthy, so checking it could never discard an entry.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban':douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        # The longest valid ISBN becomes the primary one.
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            # A tag name may hold several '/'-separated tags; de-duplicate
            # while keeping first-seen order.
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    averages = rating(extra)
    if averages:
        try:
            # The feed value is halved before being stored.
            mi.rating = float(averages[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
def parse(self, raw):
    """Parse an Edelweiss product page into a ``Metadata`` object.

    Extracts title, authors, ISBNs, tags, publisher, publication date,
    comments and cover URL. Valid ISBNs and the cover URL are also stored in
    the plugin caches, keyed by ``self.sku``.

    :param raw: raw HTML of the page.
    :return: the populated ``Metadata`` object.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select

    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')

    # The title is the first span plus any directly following 12pt spans.
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    # The last span lists the authors; parenthesised text is stripped.
    authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    # Longest ISBN first; invalid entries (falsy) sort last.
    isbns = sorted(isbns, key=lambda x:len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            # Compare year with year: comparing q.year against the
            # UNDEFINED_DATE object itself is always unequal, which let the
            # "undefined" placeholder through as a real pubdate.
            # NOTE(review): assumes UNDEFINED_DATE is a date/datetime -- confirm.
            if q.year != UNDEFINED_DATE.year:
                mi.pubdate = q
        except Exception:
            self.log.exception('Error parsing published date: %r'%pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/',
                                         'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    return mi
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    """Convert a Douban JSON book entry into a ``Metadata`` object.

    :param browser: unused here; kept for interface compatibility.
    :param log: logger used for error reporting.
    :param entry_: dict describing one book (keys read: ``id``, ``title``,
        ``summary``, ``publisher``, ``isbn13``, ``pubdate``, ``author``,
        ``tags``, ``rating``, ``images``, ``series``).
    :param timeout: unused here; kept for interface compatibility.
    :return: the ``Metadata`` object, or ``None`` if the entry has no id or
        no title.
    """
    from calibre.utils.date import parse_date, utcnow

    douban_id = entry_.get('id')
    title = entry_.get('title')
    description = entry_.get('summary')
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get('publisher')
    isbn = entry_.get('isbn13')  # ISBN10 is obsolete, use ISBN13
    pubdate = entry_.get('pubdate')
    authors = entry_.get('author')
    book_tags = entry_.get('tags')
    rating = entry_.get('rating')
    cover_url = entry_.get('images', {}).get('large')
    series = entry_.get('series')

    if not authors:
        authors = [_('Unknown')]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {'douban': douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN: the value may be a single string or a collection of strings;
    # a missing value (None) yields no ISBNs instead of raising.
    isbns = []
    if isinstance(isbn, (type(''), bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for x in isbn or ():
            if check_isbn(x):
                isbns.append(x)
    if isbns:
        # The longest valid ISBN becomes the primary one.
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags (a missing 'tags' key yields an empty list)
    mi.tags = [tag['name'] for tag in book_tags or ()]

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating:
        try:
            # The source value is halved before being stored.
            mi.rating = float(rating['average']) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series['title']

    return mi
def parse_details(self, raw, root):
    """Parse an Amazon product page and queue the resulting metadata.

    ASIN, title and authors are required: if any is missing an error is
    logged and nothing is queued. Every other field (rating, comments,
    series, tags, cover, ISBN, publisher, pubdate, language) is parsed on a
    best-effort basis -- a failure is logged and the remaining fields are
    still processed. The finished ``Metadata`` object is put on
    ``self.result_queue``.

    :param raw: raw page content; written to a temp file when
        ``self.testing`` is set and passed on to ``parse_cover``.
    :param root: parsed HTML tree of the page.
    """
    try:
        asin = self.parse_asin(root)
    except Exception:
        self.log.exception('Error parsing asin for url: %r'%self.url)
        asin = None

    if self.testing:
        # Keep a copy of the downloaded page around for debugging.
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except Exception:
        self.log.exception('Error parsing title for url: %r'%self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r'%self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, authors))
        return

    mi = Metadata(title, authors)
    # The identifier type carries the store domain, except for the .com store.
    idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_rating(root)
    except Exception:
        self.log.exception('Error parsing ratings for url: %r'%self.url)

    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except Exception:
        self.log.exception('Error parsing series for url: %r'%self.url)

    try:
        mi.tags = self.parse_tags(root)
    except Exception:
        self.log.exception('Error parsing tags for url: %r'%self.url)

    try:
        self.cover_url = self.parse_cover(root, raw)
    except Exception:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    # NOTE(review): self.cover_url and self.isbn are read below even when the
    # assignments did not run -- presumably initialised elsewhere; confirm.
    mi.has_cover = bool(self.cover_url)

    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except Exception:
            self.log.exception('Failed to parse new-style book details section')
    else:
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]

            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except Exception:
                self.log.exception('Error parsing ISBN for url: %r'%self.url)

            try:
                mi.publisher = self.parse_publisher(pd)
            except Exception:
                self.log.exception('Error parsing publisher for url: %r'%self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd)
            except Exception:
                self.log.exception('Error parsing publish date for url: %r'%self.url)

            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except Exception:
                self.log.exception('Error parsing language for url: %r'%self.url)
        else:
            self.log.warning('Failed to find product description for url: %r'%self.url)

    mi.source_relevance = self.relevance

    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                    self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    """Parse a YES24 product page and queue the resulting metadata.

    YES24 id, title and authors are required: if any is missing an error is
    logged and nothing is queued. ISBN, comments, cover, publisher and
    published date are parsed on a best-effort basis -- a failure is logged
    and the remaining fields are still processed. The language is always set
    to ``'ko'``. The finished ``Metadata`` object is put on
    ``self.result_queue``.

    :param root: parsed HTML tree of the page.
    """
    try:
        yes24_id = self.parse_yes24_id(self.url)
    except Exception:
        self.log.exception('Error parsing YES24 id for url: %r'%self.url)
        yes24_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except Exception:
        self.log.exception('Error parsing title and series for url: %r'%self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []

    if not title or not authors or not yes24_id:
        self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
        self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title,
            authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('yes24', yes24_id)
    self.yes24_id = yes24_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except Exception:
        self.log.exception('Error parsing ISBN for url: %r'%self.url)

    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except Exception:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    # NOTE(review): self.cover_url and self.isbn are read below even when the
    # assignments did not run -- presumably initialised elsewhere; confirm.
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!

    try:
        mi.publisher = self.parse_publisher(root)
    except Exception:
        self.log.exception('Error parsing publisher for url: %r'%self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
    except Exception:
        self.log.exception('Error parsing published date for url: %r'%self.url)

    mi.language = 'ko'

    mi.source_relevance = self.relevance

    if self.yes24_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def merge(self, results, min_year, do_asr=True):
    """Combine several ``Metadata`` results into a single merged record.

    :param results: metadata objects to merge.
    :param min_year: if truthy, the publication year to force on the merged
        record; otherwise the earliest pubdate among ``results`` is used.
    :param do_asr: when true, also store the mean ``relevance_in_source`` of
        ``results`` as ``average_source_relevance``.
    :return: the merged ``Metadata`` object.
    """
    merged = Metadata(_('Unknown'))
    pick = self.length_merge

    # Shortest title / publisher: assumed to carry the least cruft.
    merged.title = pick('title', results, null_value=merged.title)

    # Longest author list: extra names (editor, translator) do no harm.
    merged.authors = pick('authors', results,
                          null_value=merged.authors, shortest=False)

    merged.publisher = pick('publisher', results,
                            null_value=merged.publisher)

    # Tag-set size preference is driven by the 'fewer_tags' pref.
    merged.tags = pick('tags', results, null_value=merged.tags,
                       shortest=msprefs['fewer_tags'])

    # Longest series name: assumed to carry the most information.
    merged.series = pick('series', results,
                         null_value=merged.series, shortest=False)
    for res in results:
        if res.series and res.series == merged.series:
            merged.series_index = res.series_index
            break

    # Average the rating over every source that has one in (0, 5].
    usable = [res.rating for res in results
              if res.rating and 0 < res.rating <= 5]
    if usable:
        merged.rating = int(round(sum(usable)/len(usable)))

    # Shortest language value is the one most likely to be valid.
    merged.language = pick('language', results,
                           null_value=merged.language)

    # Longest comments win.
    merged.comments = pick('comments', results,
                           null_value=merged.comments, shortest=False)

    # Published date
    if min_year:
        # Adopt the first result published in min_year, if any, so its
        # month and day can be reused.
        same_year = next(
            (res.pubdate for res in results
             if getattr(res.pubdate, 'year', None) == min_year), None)
        if same_year is not None:
            merged.pubdate = same_year
        if getattr(merged.pubdate, 'year', None) == min_year:
            earliest = datetime(min_year, merged.pubdate.month,
                                merged.pubdate.day, tzinfo=utc_tz)
        else:
            earliest = datetime(min_year, 1, 2, tzinfo=utc_tz)
        merged.pubdate = earliest
    else:
        # The year-3001 sentinel stands for "no date found".
        sentinel = datetime(3001, 1, 1, tzinfo=utc_tz)
        dated = [as_utc(res.pubdate) for res in results
                 if res.pubdate is not None]
        earliest = min([sentinel] + dated)
        if earliest.year < 3000:
            merged.pubdate = earliest

    # Identifiers: later results overwrite earlier ones on key clashes.
    for res in results:
        merged.identifiers.update(res.identifiers)

    # Cover URL
    merged.has_cached_cover_url = any(
        getattr(res, 'has_cached_cover_url', False) for res in results)

    # Merge any other fields with no special handling (random merge)
    extra_fields = set()
    for res in results:
        if hasattr(res, 'identify_plugin'):
            extra_fields |= res.identify_plugin.touched_fields

    for name in extra_fields:
        if name.startswith('identifier:') or not merged.is_null(name):
            continue
        setattr(merged, name, self.random_merge(
            name, results, null_value=getattr(merged, name)))

    if do_asr:
        relevances = [res.relevance_in_source for res in results]
        merged.average_source_relevance = sum(relevances)/len(relevances)

    return merged
def metadata_from_xmp_packet(raw_bytes):
    """Read book metadata out of an XMP packet.

    :param raw_bytes: the raw XMP packet.
    :return: a ``Metadata`` object holding whatever fields could be read.
    :raises ValueError: if the title starts with the literal text
        ``\\376\\377`` (corrupted packet, see the bug link below).
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))

    title = first_alt('//dc:title', root)
    # Check for a value first so that a missing title cannot break the
    # startswith() call.
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
        mi.title = title

    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors

    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags

    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False)
    except Exception:
        # Missing or unparseable date: leave pubdate unset.
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp

    # Use the more recent of the metadata date and the modification date.
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd

    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    for x in ('title_sort', 'author_sort'):
        # First element with a non-empty value wins.
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:'+x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                # Invalid JSON or a value the field rejects: skip it.
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        # Build a real list: a lazy filter object would always be truthy.
        languages = [lang for lang in map(canonicalize_lang, languages) if lang]
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme that validated the value, so an
                # ISBN is not recorded as a DOI.
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
def parse_details(self, raw, root):
    """Extract book metadata from a details page and queue the result.

    Title, authors and the identifier returned by parse_dang_id() are
    mandatory; when any of them is missing the problem is logged and
    nothing is queued. Every other field is parsed on a best-effort basis:
    a failure is logged and the remaining fields are still processed.

    :param raw: the downloaded page source (written to a temporary file
        when self.testing is set, and passed to the comment/cover parsers)
    :param root: the parsed page, queried with xpath()
    :raises CaptchaError: if no identifier was found and the page contains
        the CAPTCHA validation form
    """
    dang_id = parse_dang_id(root, self.log, self.url)
    if not dang_id and root.xpath(
            '//form[@action="/errors/validateCaptcha"]'):
        raise CaptchaError(
            'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.'
        )

    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(
                prefix=(dang_id or str(uuid.uuid4())) + '_',
                suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', dang_id, 'saved in', f.name)

    # `except Exception` (not a bare except) throughout, so that
    # KeyboardInterrupt/SystemExit are not swallowed by the field parsers.
    try:
        title = self.parse_title(root)
    except Exception:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not dang_id:
        self.log.error('Could not find title/authors/dang_id for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (dang_id, title, authors))
        return

    mi = Metadata(title, authors)
    idtype = 'dang'
    mi.set_identifier(idtype, dang_id)
    self.dang_id = dang_id

    try:
        mi.comments = self.parse_comments(root, raw)
    except Exception:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except Exception:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
    except Exception:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root, raw)
    except Exception:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    pd_info = root.xpath(self.pd_info_xpath)
    pd_info_store = root.xpath(self.pd_info_store_xpath)
    pd_desc = root.xpath(self.pd_desc_xpath)
    if pd_info or pd_info_store:
        try:
            isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
            if isbn:
                self.isbn = mi.isbn = isbn
        except Exception:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        # Publisher and date are read from the first info node, falling
        # back to the store info node when the former is absent.
        if pd_info:
            pd_info = pd_info[0]
        else:
            pd_info = pd_info_store[0]

        try:
            mi.publisher = self.parse_publisher(pd_info)
        except Exception:
            self.log.exception('Error parsing publisher for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_pubdate(pd_info)
        except Exception:
            self.log.exception('Error parsing publish date for url: %r' % self.url)
    else:
        self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.dang_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(
                self.dang_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)

    self.result_queue.put(mi)
def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers):
    """Turn the BookData entries of a parsed response into Metadata objects.

    An entry is skipped when it has no valid ISBN, does not match the
    ISBN being searched for, lacks a title or authors, was already seen,
    does not match the query tokens, or has a publisher containing 'audio'.

    :param feed: parsed response element containing a BookList child
    :param seen: set of (title, authors-tuple) keys already processed;
        updated in place
    :param orig_title: title being searched for, used to build match tokens
    :param orig_authors: authors being searched for, used to build match
        tokens
    :param identifiers: mapping of identifiers; only 'isbn' is consulted
    :return: (total_results, shown_results, list of Metadata)
    :raises ValueError: if the response contains no BookList element
    """
    from lxml import etree

    def tostring(x):
        # Text content of an element, or '' when the element is missing
        if x is None:
            return ''
        return etree.tostring(x, method='text', encoding=unicode).strip()

    orig_isbn = identifiers.get('isbn', None)
    title_tokens = list(self.get_title_tokens(orig_title))
    author_tokens = list(self.get_author_tokens(orig_authors))
    results = []

    def ismatch(title, authors):
        # With no tokens for a field that field matches trivially;
        # otherwise a single token hit is enough.
        authors = lower(' '.join(authors))
        title = lower(title)
        title_ok = not title_tokens or any(
            lower(t) in title for t in title_tokens)
        author_ok = not author_tokens or any(
            lower(a) in authors for a in author_tokens)
        return title_ok and author_ok

    bl = feed.find('BookList')
    if bl is None:
        err = tostring(feed.find('errormessage'))
        raise ValueError('ISBNDb query failed:' + err)
    total_results = int(bl.get('total_results'))
    shown_results = int(bl.get('shown_results'))

    for bd in bl.xpath('.//BookData'):
        isbn = check_isbn(bd.get('isbn', None))
        isbn13 = check_isbn(bd.get('isbn13', None))
        if not isbn and not isbn13:
            continue
        if orig_isbn and orig_isbn not in {isbn, isbn13}:
            continue

        title = tostring(bd.find('Title'))
        if not title:
            continue

        authors = []
        for au in bd.xpath('.//Authors/Person'):
            au = tostring(au)
            if au:
                if ',' in au:
                    # "Last, First" -> "First Last"
                    ln, _, fn = au.partition(',')
                    au = fn.strip() + ' ' + ln.strip()
                authors.append(au)
        if not authors:
            continue

        comments = tostring(bd.find('Summary'))

        id_ = (title, tuple(authors))
        if id_ in seen:
            continue
        seen.add(id_)
        if not ismatch(title, authors):
            continue

        publisher = tostring(bd.find('PublisherText'))
        if not publisher:
            publisher = None
        if publisher and 'audio' in publisher.lower():
            continue

        mi = Metadata(title, authors)
        # Fall back to the 13-digit ISBN: an entry accepted only because
        # its isbn13 is valid must not end up without an ISBN.
        mi.isbn = isbn or isbn13
        mi.publisher = publisher
        mi.comments = comments
        results.append(mi)

    return total_results, shown_results, results
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Convert one search-result entry into a Metadata object.

    The entry supplies the id URL, title and authors. The id URL is then
    fetched with get_details() to fill in comments, language, publisher,
    ISBNs, tags, publication date, rating and the cover thumbnail URL.

    :param browser: passed through to get_details()
    :param log: logger used to report parsing problems
    :param entry_: the atom entry element from the search results
    :param timeout: passed through to get_details()
    :return: a Metadata object, or None when the entry has no id URL or
        no title. When the details cannot be fetched or parsed, the
        Metadata holding only title, authors and google id is returned.
    """
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')
    rating = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        # Stripped text of the first node matched by x, or None
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_nodes = entry_id(entry_)
    id_url = id_nodes[0].text if id_nodes else None
    title_ = ': '.join([x.text for x in title(entry_) if x.text]).strip()
    # Test the extracted title text, not the `title` XPath object (which is
    # always truthy), and do it before id_url is dereferenced below.
    if not id_url or not title_:
        # Silently discard this entry
        return None
    google_id = id_url.split('/')[-1]

    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'google':google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)

    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang

    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        # Avoid str(): it turns a missing text into 'None' and can fail on
        # non-ASCII text under Python 2.
        t = (x.text or '').strip()
        if t[:5].upper() == 'ISBN:':
            t = check_isbn(t[5:])
            if t:
                isbns.append(t)
    if isbns:
        # Prefer the longest ISBN found
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            # Subjects are '/'-separated paths; keep each part once, in
            # first-seen order
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except Exception:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
def parse_details(self, root):
    """Extract book metadata from a details page and queue the result.

    The databazeknih id (parsed from self.url), title and authors are
    mandatory; when any of them is missing the problem is logged and
    nothing is queued. All other fields are best effort: a failing parser
    is logged and the remaining fields are still processed.

    :param root: the parsed page, handed to the individual parse_* methods
    """
    # `except Exception` (not a bare except) throughout, so that
    # KeyboardInterrupt/SystemExit are not swallowed by the field parsers.
    try:
        self.log.info('Parse details:%s'%self.url)
        databazeknih_id = self.parse_databazeknih_id(self.url)
        self.log.info('Parsed DK identifier:%s'%databazeknih_id)
    except Exception:
        self.log.exception('Error parsing databazeknih id for url: %r'%self.url)
        databazeknih_id = None

    try:
        title = self.parse_title(root)
        self.log.info('Parsed title:%s'%title)
    except Exception:
        self.log.exception('Error parsing title for url: %r'%self.url)
        title = None

    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors:%s'%authors)
    except Exception:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []

    if not title or not authors or not databazeknih_id:
        self.log.error('Could not find title/authors/databazeknih id for %r'%self.url)
        self.log.error('DK id: %r Title: %r Authors: %r'%(databazeknih_id, title, authors))
        return

    mi = Metadata(title, authors)
    self.log.info('dbki:%s'%databazeknih_id)
    mi.set_identifier('databazeknih', databazeknih_id)
    self.databazeknih_id = databazeknih_id

    try:
        (mi.series, mi.series_index) = self.parse_series(root)
        self.log.info('Parsed series:%s'%mi.series)
        self.log.info('Parsed series index:%s'%mi.series_index)
    except Exception:
        self.log.exception('Error parsing series for url: %r'%self.url)

    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments:%s'%mi.comments)
    except Exception:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover:%r'%self.cover_url)
        # Only cache a cover URL that was actually found
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.databazeknih_id, self.cover_url)
    except Exception:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    # getattr: the attribute may never have been assigned if parse_cover
    # failed (NOTE(review): confirm whether __init__ initialises it)
    mi.has_cover = bool(getattr(self, 'cover_url', None))

    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags:%s'%mi.tags)
    except Exception:
        self.log.exception('Error parsing tags for url: %r'%self.url)

    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher:%s'%mi.publisher)
    except Exception:
        self.log.exception('Error parsing publisher for url: %r'%self.url)

    try:
        mi.pubdate = self.parse_pubdate(root)
        self.log.info('Parsed pubdate:%s'%mi.pubdate)
    except Exception:
        self.log.exception('Error parsing pubdate for url: %r'%self.url)

    try:
        mi.rating = self.parse_rating(root)
        self.log.info('Parsed rating:%s'%mi.rating)
    except Exception:
        self.log.exception('Error parsing rating for url: %r'%self.url)

    mi.source_relevance = self.relevance

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except Exception:
        self.log.exception('Error parsing ISBN for url: %r'%self.url)

    # Do not register a missing ISBN in the ISBN -> identifier cache
    known_isbn = getattr(self, 'isbn', None)
    if self.databazeknih_id and known_isbn:
        self.plugin.cache_isbn_to_identifier(known_isbn, self.databazeknih_id)

    # self.plugin.clean_downloaded_metadata(mi)
    # mi.isbn = check_isbn(mi.isbn)
    self.log.info(mi)
    self.result_queue.put(mi)
def parse(self, raw):
    """Parse a product page into a Metadata object.

    Reads title and authors from the banner spans, ISBNs from the sku
    block, and tags, publisher, publication date, comments and cover URL
    from their respective page sections. ISBNs and the cover URL are
    registered in the plugin caches against self.sku.

    :param raw: the page source, parsed with parse_html()
    :return: the populated Metadata object
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select
    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')

    # The title is the first span plus any directly following spans whose
    # style mentions '12pt'
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    # Authors come from the last span, with parenthesised text removed
    authors = [
        re.sub(r'\(.*\)', '', x).strip()
        for x in astext(spans[-1]).split(',')
    ]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    # Longest valid ISBN first; invalid entries (None) sort last
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [
            t[1:].strip() if t.startswith('&') else t for t in mi.tags
        ]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        # Keep the text after the first ':' when there is one
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            # Compare year with year: comparing q.year with the
            # UNDEFINED_DATE object itself is always unequal.
            if q.year != UNDEFINED_DATE.year:
                mi.pubdate = q
        except Exception:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm_parts = []
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        # Skip the placeholder shown when there is no summary
        if q != '<p>No title summary available. </p>':
            comm_parts.append(q)
    for css in ('div#pd-general-contributor-content',
                'div#pd-general-quotes-content'):
        general = tuple(selector(css))
        if general:
            comm_parts.append(self.render_comments(general[0]))
    comm = ''.join(comm_parts)
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/',
                                         'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(
        self.sku) is not None
    return mi
def metadata_from_xmp_packet(raw_bytes):
    """Build a Metadata object from the fields found in an XMP packet.

    The packet is parsed with parse_xmp_packet() and then queried for title,
    authors, tags, comments, publisher, dates, producer, rating, series,
    sort strings, calibre JSON maps, languages and identifiers. A field is
    only set on the result when a value was actually found.

    :param raw_bytes: the raw XMP packet, handed unchanged to
        parse_xmp_packet()
    :return: the populated Metadata object
    :raises ValueError: if the title carries the marker of a corrupted
        packet (see the Nitro PDF bug referenced below)
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))

    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError(
                'Corrupted XMP metadata packet detected, probably generated by Nitro PDF'
            )
        mi.title = title

    authors = multiple_sequences('//dc:creator', root)
    if authors:
        # Each creator entry is expanded through string_to_authors()
        mi.authors = [au for aus in authors for au in string_to_authors(aus)]

    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags

    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(
            first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root),
            assume_utc=False)
    except Exception:
        # Best effort: a missing or unparseable date leaves pubdate unset
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp

    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd

    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    for x in ('title_sort', 'author_sort'):
        # The first element that yields a non-empty value wins
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                # Best effort: ignore values that are not usable JSON
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                key = scheme.lower()
                if key == 'isbn':
                    val = check_isbn(val)
                elif key == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[key] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in (('doi', check_doi), ('isbn', check_isbn)):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme that validated the value, so an
                # ISBN is not filed as a DOI.
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    """Assemble a Metadata object for one book from the cached fields.

    :param book_id: id of the book whose fields are read
    :param get_user_categories: when true, mi.user_categories is filled
        from the 'user_categories' preference; otherwise it is left as an
        empty dict
    :return: the populated Metadata object
    """
    field = self._field_for
    mi = Metadata(None, template_cache=self.formatter_template_cache)

    # Authors: names in order, plus name -> sort string / link maps
    author_records = [
        self._author_data(author_id)
        for author_id in self._field_ids_for('authors', book_id)
    ]
    author_names = [rec['name'] for rec in author_records]
    author_sorts = {rec['name']: rec['sort'] for rec in author_records}
    author_links = {rec['name']: rec['link'] for rec in author_records}

    mi.title = field('title', book_id, default_value=_('Unknown'))
    mi.authors = author_names
    mi.author_sort = field('author_sort', book_id,
                           default_value=_('Unknown'))
    mi.author_sort_map = author_sorts
    mi.author_link_map = author_links
    mi.comments = field('comments', book_id)
    mi.publisher = field('publisher', book_id)

    # A single "now" is used as the default for every date field
    now = nowf()
    mi.timestamp = field('timestamp', book_id, default_value=now)
    mi.pubdate = field('pubdate', book_id, default_value=now)
    mi.uuid = field('uuid', book_id, default_value='dummy')
    mi.title_sort = field('sort', book_id, default_value=_('Unknown'))
    mi.book_size = field('size', book_id, default_value=0)
    mi.ondevice_col = field('ondevice', book_id, default_value='')
    mi.last_modified = field('last_modified', book_id, default_value=now)

    formats = field('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(field('languages', book_id))
    if formats:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        mi.formats = FormatsList(formats, mi.format_metadata)
    else:
        mi.formats = None

    if field('cover', book_id, default_value=False):
        mi.has_cover = _('Yes')
    else:
        mi.has_cover = ''
    mi.tags = list(field('tags', book_id, default_value=()))
    mi.series = field('series', book_id)
    if mi.series:
        mi.series_index = field('series_index', book_id,
                                default_value=1.0)
    mi.rating = field('rating', book_id)
    mi.set_identifiers(field('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id

    # Custom columns: plain ones are set right away, composite ones only
    # after all plain values are in place (they are computed from mi).
    composite_keys = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composite_keys.append(key)
            continue
        value = field(key, book_id)
        if isinstance(value, tuple):
            value = list(value)
        index = field(key + '_index', book_id)
        mi.set(key, val=value, extra=index)
    for key in composite_keys:
        mi.set(key, val=self._composite_for(key, book_id, mi))

    def belongs(name, cat):
        # True when `name` is the value of field `cat`, or one of its
        # values when the field holds a list
        current = mi.get(cat, None)
        if isinstance(current, list):
            return name in current
        return name == current

    categories = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            categories[ucat] = [
                [name, cat]
                for name, cat, _ignored in user_cats[ucat]
                if belongs(name, cat)
            ]
    mi.user_categories = categories

    return mi
def parse_details(self, root):
    """Extract book metadata from a details page and queue the result.

    The goodreads id (parsed from self.url), title and authors are
    mandatory; when any of them is missing the problem is logged and
    nothing is queued. All other fields are best effort: a failing parser
    is logged and the remaining fields are still processed.

    :param root: the parsed page, handed to the individual parse_* methods
    """
    # `except Exception` (not a bare except) throughout, so that
    # KeyboardInterrupt/SystemExit are not swallowed by the field parsers.
    try:
        goodreads_id = self.parse_goodreads_id(self.url)
    except Exception:
        self.log.exception("Error parsing goodreads id for url: %r" % self.url)
        goodreads_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except Exception:
        self.log.exception("Error parsing title and series for url: %r" % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception("Error parsing authors for url: %r" % self.url)
        authors = []

    if not title or not authors or not goodreads_id:
        self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
        self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier("goodreads", goodreads_id)
    self.goodreads_id = goodreads_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except Exception:
        self.log.exception("Error parsing ISBN for url: %r" % self.url)

    try:
        mi.rating = self.parse_rating(root)
    except Exception:
        self.log.exception("Error parsing ratings for url: %r" % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception("Error parsing comments for url: %r" % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except Exception:
        self.log.exception("Error parsing cover for url: %r" % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except Exception:
        self.log.exception("Error parsing tags for url: %r" % self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except Exception:
        self.log.exception("Error parsing publisher and date for url: %r" % self.url)

    mi.source_relevance = self.relevance

    # Register what was found in the plugin caches for later lookups
    if self.goodreads_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)

    self.result_queue.put(mi)