def sample_results(self):
    m1 = Metadata("The Great Gatsby", ["Francis Scott Fitzgerald"])
    m2 = Metadata("The Great Gatsby - An extra long title to test resizing", ["F. Scott Fitzgerald"])
    m1.has_cached_cover_url = True
    m2.has_cached_cover_url = False
    m1.comments = "Some comments " * 10
    m1.tags = ["tag%d" % i for i in range(20)]
    m1.rating = 4.4
    m1.language = "en"
    m2.language = "fr"
    m1.pubdate = utcnow()
    m2.pubdate = fromordinal(1000000)
    m1.publisher = "Publisher 1"
    m2.publisher = "Publisher 2"
    return [m1, m2]

def data2mi(self, item):
    """Convert a single metadata answer in the form of a dict to a
    Metadata object"""
    mi = Metadata(_('Unknown'))

    # Regular metadata
    mi.title = item.get('title', None)
    mi.authors = item.get('authors', [])
    mi.publisher = item.get('publisher', None)

    if 'id' in item:
        mi.set_identifier(self.idkey, item['id'])
    if 'doi' in item:
        mi.set_identifier('doi', item['doi'])
    if 'isbn' in item:
        mi.set_identifier('isbn', item['isbn'])

    if 'updated' in item:
        mi.pubdate = parse_date(item['updated'], assume_utc=True)
    if 'series' in item:
        mi.series = item['series']
        mi.series_index = self.format_series_index(item.get('series_index'), None)
    if 'year' in item:
        mi.pubdate = parse_date(item['year'], assume_utc=True)
    if 'abstract' in item:
        mi.comments = self.format_abstract(item['abstract'])
    if 'language' in item:
        mi.language = item['language']
    if 'journal' in item:
        mi.series = item['journal']
        mi.series_index = self.format_series_index(item.get('volume'), item.get('number'))

    if 'subject' in item:
        tags = set()
        for s in item['subject']:
            tags.update(msc_tags(s))
            tags.update(arxiv_tags(s))
        mi.tags = sorted(tags)

    return mi

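# The series handling above delegates to a format_series_index helper that is
# not part of this excerpt. A minimal sketch of what such a helper might look
# like, assuming the volume maps to the integer part and the issue number to
# the fractional part; this is an illustrative reconstruction, not the
# plugin's actual implementation.
def format_series_index(self, volume, number):
    # Hypothetical helper: combine e.g. volume 12, number 3 into 12.03 so
    # that issues sort correctly within a journal treated as a series.
    def to_int(value):
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0
    return to_int(volume) + to_int(number) / 100.0
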
def parse_details(self, raw, root):
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        import tempfile
        import uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    mi = Metadata(title, authors)
    idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except:
            self.log.exception('Failed to parse new-style book details section')
    else:
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]

            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)

            try:
                mi.publisher = self.parse_publisher(pd)
            except:
                self.log.exception('Error parsing publisher for url: %r' % self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd)
            except:
                self.log.exception('Error parsing publish date for url: %r' % self.url)

            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except:
                self.log.exception('Error parsing language for url: %r' % self.url)
        else:
            self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                                                      self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)

def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    # total_results = XPath('//openSearch:totalResults')
    # start_index = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')
    rating = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        # Prefer the longest form, i.e. ISBN-13 over ISBN-10
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi

def parse_details(self, root):
    try:
        yes24_id = self.parse_yes24_id(self.url)
    except:
        self.log.exception('Error parsing YES24 id for url: %r' % self.url)
        yes24_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not yes24_id:
        self.log.error('Could not find title/authors/YES24 id for %r' % self.url)
        self.log.error('YES24: %r Title: %r Authors: %r' % (yes24_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('yes24', yes24_id)
    self.yes24_id = yes24_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!

    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    mi.language = 'ko'
    mi.source_relevance = self.relevance

    if self.yes24_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)

def merge(self, results, min_year, do_asr=True):
    ans = Metadata(_('Unknown'))

    # We assume the shortest title has the least cruft in it
    ans.title = self.length_merge('title', results, null_value=ans.title)

    # No harm in having extra authors, maybe something useful like an
    # editor or translator
    ans.authors = self.length_merge('authors', results,
                                    null_value=ans.authors, shortest=False)

    # We assume the shortest publisher has the least cruft in it
    ans.publisher = self.length_merge('publisher', results,
                                      null_value=ans.publisher)

    # We assume the smallest set of tags has the least cruft in it
    ans.tags = self.length_merge('tags', results,
                                 null_value=ans.tags, shortest=msprefs['fewer_tags'])

    # We assume the longest series has the most info in it
    ans.series = self.length_merge('series', results,
                                   null_value=ans.series, shortest=False)
    for r in results:
        if r.series and r.series == ans.series:
            ans.series_index = r.series_index
            break

    # Average the rating over all sources
    ratings = []
    for r in results:
        rating = r.rating
        if rating and rating > 0 and rating <= 5:
            ratings.append(rating)
    if ratings:
        ans.rating = int(round(sum(ratings) / len(ratings)))

    # Smallest language is likely to be valid
    ans.language = self.length_merge('language', results,
                                     null_value=ans.language)

    # Choose longest comments
    ans.comments = self.length_merge('comments', results,
                                     null_value=ans.comments, shortest=False)

    # Published date
    if min_year:
        for r in results:
            year = getattr(r.pubdate, 'year', None)
            if year == min_year:
                ans.pubdate = r.pubdate
                break
        if getattr(ans.pubdate, 'year', None) == min_year:
            min_date = datetime(min_year, ans.pubdate.month,
                                ans.pubdate.day, tzinfo=utc_tz)
        else:
            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
        ans.pubdate = min_date
    else:
        min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
        for r in results:
            if r.pubdate is not None:
                candidate = as_utc(r.pubdate)
                if candidate < min_date:
                    min_date = candidate
        if min_date.year < 3000:
            ans.pubdate = min_date

    # Identifiers
    for r in results:
        ans.identifiers.update(r.identifiers)

    # Cover URL
    ans.has_cached_cover_url = bool(
        [r for r in results if getattr(r, 'has_cached_cover_url', False)])

    # Merge any other fields with no special handling (random merge)
    touched_fields = set()
    for r in results:
        if hasattr(r, 'identify_plugin'):
            touched_fields |= r.identify_plugin.touched_fields
    for f in touched_fields:
        if f.startswith('identifier:') or not ans.is_null(f):
            continue
        setattr(ans, f, self.random_merge(f, results,
                                          null_value=getattr(ans, f)))

    if do_asr:
        avg = [x.relevance_in_source for x in results]
        avg = sum(avg) / len(avg)
        ans.average_source_relevance = avg

    return ans

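# The merge above relies on two helpers, length_merge and random_merge, that
# are not part of this excerpt. Minimal sketches of plausible implementations
# follow, assuming each result is a Metadata object and that len() is a usable
# cruft heuristic for the merged fields; these are illustrative
# reconstructions, not calibre's actual code.
def length_merge(self, field, results, null_value=None, shortest=True):
    # Pick the shortest (or longest) non-null value of `field` across results.
    values = [getattr(r, field) for r in results]
    values = [v for v in values if v and v != null_value]
    if not values:
        return null_value
    return min(values, key=len) if shortest else max(values, key=len)

def random_merge(self, field, results, null_value=None):
    # Pick an arbitrary non-null value of `field` across results.
    import random
    values = [getattr(r, field) for r in results if not r.is_null(field)]
    return random.choice(values) if values else null_value
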
def parse_details(self, root):
    try:
        kyobobook_id = self.parse_kyobobook_id(self.url)
    except:
        self.log.exception('Error parsing Kyobobook id for url: %r' % self.url)
        kyobobook_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not kyobobook_id:
        self.log.error('Could not find title/authors/kyobobook id for %r' % self.url)
        self.log.error('Kyobobook: %r Title: %r Authors: %r' % (kyobobook_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('kyobobook', kyobobook_id)
    self.kyobobook_id = kyobobook_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)

    try:
        lang = self._parse_language(root)
        if lang:
            mi.language = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.kyobobook_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
                                                      self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)

def parse_details(self, root):
    try:
        isbn = self.extract_isbn(self.url)
    except:
        self.log.exception('No ISBN in URL: %r' % self.url)
        isbn = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not isbn:
        self.log.error('Could not find title/authors/Aladin id for %r' % self.url)
        self.log.error('Aladin: %r Title: %r Authors: %r' % (isbn, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    # mi.set_identifier('isbn', isbn)
    mi.isbn = isbn
    self.isbn = isbn

    # ISBN-13
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!
    if mi.has_cover:
        self.log.info('Cover URL: ' + mi.cover_url)

    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    mi.language = 'ko'
    mi.source_relevance = self.relevance

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)

def load_details(self, url, timeout):
    def _format_item(s):
        return re.sub('^"(.*)"$', r'\1', unescape(s))

    def _format_list(s):
        return [_.strip() for _ in _format_item(s).split(',')]

    def _find_meta(node, property):
        return [_.get('content') for _ in node if _.get('property') == property][0]

    def _format_date(date_text):
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())

        # Fields taken from the <meta> tags:
        # book ID, title, ISBN, cover image URL, rating
        meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')

        # Fields taken from the schema.org JSON-LD:
        # title, authors, description, publisher
        ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        self.log.exception(e)
        return  # Without the parsed page there is nothing to extract

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')
    title = _find_meta(meta, 'og:title')

    authors = _format_list(book_info['author']['name'])
    if 'translator' in book_info:
        authors.extend([_ + '(역자)' for _ in _format_list(book_info['translator']['name'])])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

    series = re.search(r'(.*)\s*(\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))

    mi.language = 'Korean'
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)

def parse_details(self, raw, root):
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        import tempfile
        # asin may still be None here, so guard the filename prefix
        with tempfile.NamedTemporaryFile(prefix=(asin or 'unknown') + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    mi = Metadata(title, authors)
    idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    pd = root.xpath(self.pd_xpath)
    if pd:
        pd = pd[0]

        try:
            isbn = self.parse_isbn(pd)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            mi.publisher = self.parse_publisher(pd)
        except:
            self.log.exception('Error parsing publisher for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_pubdate(pd)
        except:
            self.log.exception('Error parsing publish date for url: %r' % self.url)

        try:
            lang = self.parse_language(pd)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)
    else:
        self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                                                      self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)