def parse(self, xml_detail):
    """Extract book metadata from a parsed detail page.

    Each field is delegated to a parse_* helper; the results are then
    assembled into a calibre Metadata object.  Returns the Metadata
    instance, or None when no title or author could be extracted.
    """
    # Run every field extractor up front so any side effects they have
    # (logging, caching) occur regardless of the final outcome.
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    series_name, series_idx = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    # Without both a title and at least one author there is no usable record.
    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: str(self.number)}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = series_name
    mi.series_index = series_idx
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(str(self.number), cover)
    return mi
def _metadata(self, book):
    """Build a calibre Metadata record from a douban API book dict.

    Optional fields of the API response are read defensively with
    dict.get() so a sparse record no longer raises KeyError.
    Returns the populated Metadata object.
    """
    authors = []
    if book['author']:
        for author in book['author']:
            # Strip unwanted fragments from author names
            # (REMOVES holds compiled regex patterns).
            for r in REMOVES:
                author = r.sub("", author)
            authors.append(author)
    if not authors:
        authors = [u'佚名']  # "anonymous"

    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    mi = Metadata(book['title'])
    mi.authors = authors
    mi.author_sort = mi.authors[0]
    mi.publisher = book['publisher']
    mi.comments = book['summary']
    mi.isbn = book.get('isbn13', None)
    # BUG FIX: records without tags or without any rating used to raise
    # KeyError; treat both as optional.
    mi.tags = [t['name'] for t in book.get('tags', [])][:8]
    rating = book.get('rating')
    if rating:
        mi.rating = int(float(rating['average']))
    mi.pubdate = self.str2date(book['pubdate'])
    mi.timestamp = datetime.datetime.now()
    mi.douban_id = book['id']
    mi.douban_author_intro = book['author_intro']
    mi.douban_subtitle = book.get('subtitle', None)
    # BUG FIX: only build the ISBN-based URL when an ISBN exists, instead
    # of producing ".../isbn/None".
    if mi.isbn:
        mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
    mi.source = u'豆瓣'
    mi.cover_url = book['images']['large']
    if self.copy_image:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)
    logging.debug("=================\ndouban metadata:\n%s" % mi)
    return mi
def parse(self, xml_detail):
    """Parse a pipe-delimited detail record (second line of the payload).

    Field positions observed in the code: 0=author, 1=title, 3=ISBN,
    6=publisher, 13=comments, 34=publication date (Y-M-D).
    Returns a Metadata object, or None when title/authors are missing.
    """
    data = xml_detail.split('\n')[1].split("|")
    self.log(data)

    title = data[1]
    authors = [data[0]]
    comments = data[13]
    isbn = data[3]
    publisher = data[6]
    pub_date_tmp = data[34].split('-')
    pub_date = datetime.datetime(int(pub_date_tmp[0]), int(pub_date_tmp[1]),
                                 int(pub_date_tmp[2]), tzinfo=utc_tz)

    # BUG FIX: fields produced by str.split() are strings, never None, so
    # the original `isbn is not None` check always passed and an empty
    # ISBN yielded a bogus cover URL.  Test truthiness instead.
    if isbn:
        isbn_tmp = re.sub("-", "", isbn)  # cover filenames use bare digits
        cover = "%s/images/covers/%s.jpg" % (self.plugin.BASE_URL, isbn_tmp)
    else:
        cover = None

    # NOTE(review): title/authors come from split() and are never None;
    # this guard is kept for interface parity with the sibling parsers.
    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.publisher = publisher
        mi.pubdate = pub_date
        mi.isbn = isbn
        mi.cover_url = cover
        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)
        return mi
    else:
        return None
def parse(self, xml_detail, xml_more_info):
    """Build a Metadata record from the detail page plus the extra-info page.

    ISBN, tags and publication year need the secondary page; everything
    else comes from the main detail page.  Returns Metadata, or None when
    title/authors could not be parsed.
    """
    # Extract every field first; side effects (logging etc.) of the
    # helpers happen even when the record is ultimately skipped.
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    series_name, series_idx = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    if title is None or authors is None:
        self.log('Result skipped for because title or authors not found')
        return None

    mi = Metadata(as_unicode(title), authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: self.ident}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = series_name
    mi.series_index = series_idx
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(self.ident, cover)
    return mi
def _metadata(self, baike):
    """Convert a Baidu Baike page object into a calibre Metadata record.

    Publisher falls back through the "first published" fields; for
    finished serials the completion date replaces the default pubdate.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    # Parenthesised print is valid under both Python 2 and 3.
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "网络小说平台"
    plat = info.get(u'首发状态', plat)
    plat = info.get(u'首发网站', plat)
    plat = plat.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing citation marker like "[3]" from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    # BUG FIX: guard against a missing cover URL — urlopen(None) raises.
    # (The sibling implementation of this method already guards it.)
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)
    if u'完结' in info.get(u'连载状态', ""):
        day = re.findall(r'\d*-\d*-\d*', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except:
            pass
    return mi
def parse(self, xml_detail):
    """Parse a library-catalogue record table into a calibre Metadata object.

    Each row of the record table carries a field label in its first cell
    (MARC-style numeric tags plus 'SYS') and the field value in the second.
    Returns Metadata on success, None when title, authors or the system
    identifier are missing.
    """
    sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
    authors = []
    tags = []
    xpath = self.XPath('//table[@id="record"]//tr')
    for row in xpath(xml_detail):
        ch = row.getchildren()
        txt = ch[0].text.strip()           # field label
        data = self.normalize(ch[1].text)  # field value
        if txt.startswith('245') and title is None:
            # primary title row — only taken when no title seen yet
            title = self.parse_title(data)
        if txt.startswith('246'):
            # variant-title row overrides a 245 title when present
            # (deliberately `if`, not `elif`)
            title = self.parse_title(data)
        elif txt.startswith('100') or txt.startswith('700'):
            # main / added personal-name rows -> authors
            res = self.parse_author(data)
            if res is not None:
                authors.append(res)
        elif txt == 'SYS':
            # catalogue system identifier, used as the calibre identifier
            sys_ident = data.strip()
        elif txt =='020':
            isbn = self.parse_isbn(data)
        elif txt == '260':
            publisher, pub_year = self.parse_publisher(data)
        elif txt.startswith('490') and serie is None:
            serie, serie_index = self.parse_serie(data)
        elif txt == '655 7':
            # genre/form term row -> tag
            tags.append(self.parse_tags(data))
    # The cover can only be located via a non-empty ISBN.
    if isbn is not None and isbn != '':
        cover = self.parse_cover(isbn)
    if title is not None and len(authors) > 0 and sys_ident is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.identifiers = {self.plugin.name:sys_ident}
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover
        if cover:
            self.plugin.cache_identifier_to_cover_url(sys_ident, cover)
        return mi
    else:
        self.log('Data not found')
        return None
def _metadata(self, baike):
    """Turn a Baidu Baike page object into a calibre Metadata record."""
    from calibre.ebooks.metadata.book.base import Metadata

    info = baike.get_info()
    logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])

    # Determine the hosting platform: fall back through the
    # "first published" fields, stripping the 首发 suffix.
    platform = "网络小说平台"
    platform = info.get(u'首发状态', platform)
    platform = info.get(u'首发网站', platform)
    platform = platform.replace(u'首发', '')
    mi.publisher = info.get(u'连载平台', platform)

    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    mi.provider_key = KEY
    mi.provider_value = baike.get_id()

    # Optionally download the cover image into the record.
    if self.copy_image and mi.cover_url:
        logging.debug("fetching cover: %s", mi.cover_url)
        buf = io.BytesIO(urlopen(mi.cover_url).read())
        fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (fmt, buf)

    # Finished serials carry a completion date in the status field;
    # prefer it over "now" as the publication date.
    if u'完结' in info.get(u'连载状态', ""):
        dates = re.findall('\d*-\d*-\d*', info[u'连载状态'])
        try:
            mi.pubdate = datetime.datetime.strptime(dates[0], '%Y-%m-%d')
        except:
            pass
    return mi
def _metadata(self, baike):
    """Convert a Baidu Baike page into a Metadata record (trad.-Chinese UI).

    Mirrors the simplified-Chinese variant: publisher falls back through
    the "first published" fields; a finished serial's completion date
    replaces the default pubdate.
    """
    from calibre.ebooks.metadata.book.base import Metadata
    from cStringIO import StringIO

    info = baike.get_info()
    # Parenthesised print is valid under both Python 2 and 3.
    print("\n".join("%s:\t%s" % v for v in info.items()))

    mi = Metadata(info['title'])
    plat = "網絡小說平台"
    plat = info.get(u'首發狀態', plat)
    plat = info.get(u'首發網站', plat)
    plat = plat.replace(u'首發', '')
    mi.publisher = info.get(u'連載平台', plat)
    mi.authors = [info.get(u'作者', u'佚名')]
    mi.author_sort = mi.authors[0]
    mi.isbn = BAIKE_ISBN
    mi.tags = baike.get_tags()
    mi.pubdate = datetime.datetime.now()
    mi.timestamp = datetime.datetime.now()
    mi.cover_url = baike.get_image()
    # Drop a trailing citation marker like "[3]" from the summary.
    mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
    mi.website = baike.http.url
    mi.source = u'百度百科'
    # BUG FIX: guard against a missing cover URL — urlopen(None) raises.
    if self.copy_image and mi.cover_url:
        img = StringIO(urlopen(mi.cover_url).read())
        img_fmt = mi.cover_url.split(".")[-1]
        mi.cover_data = (img_fmt, img)
    if u'完結' in info.get(u'連載狀態', ""):
        day = re.findall(r'\d*-\d*-\d*', info[u'連載狀態'])
        try:
            mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
        except:
            pass
    return mi
#!/usr/bin/env python
def load_details(self, url, timeout):
    """Fetch a ridibooks detail page, parse it and queue a Metadata record.

    Metadata comes from two places: the page's OpenGraph/books <meta>
    tags (id, title, ISBN, cover, rating) and its schema.org JSON-LD
    blob (title, authors, description, publisher, date).
    """
    def _format_item(text):
        # Strip surrounding double quotes and unescape HTML entities.
        return re.sub('^"(.*)"$', '\\1', unescape(text))

    def _format_list(text):
        # Comma-separated value -> list of trimmed items.
        return [_.strip() for _ in _format_item(text).split(',')]

    def _find_meta(node, property):
        # First <meta> content whose @property matches.
        return [_.get('content') for _ in node if _.get('property') == property][0]

    def _format_date(date_text):
        # "YYYYMMDD" -> tz-aware datetime.
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())
        # Items loaded from <meta> tags: book id, title, ISBN, cover URL, rating.
        meta = root.xpath(
            '//meta[starts-with(@property, "og") or starts-with(@property, "books")]'
        )
        # Items from the schema.org JSON-LD: title, authors, description, publisher.
        ld_json = root.xpath(
            '//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        self.log.exception(e)
        # BUG FIX: the original fell through after logging and then hit
        # NameErrors on `meta`/`book_info`; bail out when parsing fails.
        return

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')
    title = _find_meta(meta, 'og:title')

    authors = _format_list(book_info['author']['name'])
    # BUG FIX: dict.has_key() does not exist on Python 3; `in` works on both.
    if 'translator' in book_info:
        authors.extend([
            _ + u'(역자)'  # mark translators ("역자" = translator)
            for _ in _format_list(book_info['translator']['name'])
        ])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

    # A trailing volume number ("...N권") in the title marks a series entry.
    series = re.search(u'(.*)\s*(\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))

    mi.language = 'Korean'
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(
                ridibooks_id, cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def load_details(self, url, timeout):
    """Fetch and parse a ridibooks detail page; queue the Metadata result.

    Fields are read from the OpenGraph/books <meta> tags and from the
    schema.org JSON-LD block embedded in the page.
    """
    def _format_item(text):
        # Remove wrapping double quotes, unescape HTML entities.
        return re.sub('^"(.*)"$', '\\1', unescape(text))

    def _format_list(text):
        return [_.strip() for _ in _format_item(text).split(',')]

    def _find_meta(node, property):
        return [_.get('content') for _ in node if _.get('property') == property][0]

    def _format_date(date_text):
        # "YYYYMMDD" string -> tz-aware datetime.
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())
        # <meta> tags: book id, title, ISBN, cover URL, rating.
        meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')
        # schema.org JSON-LD: title, authors, description, publisher.
        ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        self.log.exception(e)
        # BUG FIX: previously execution continued and crashed with
        # NameError on `meta`/`book_info`; return on parse failure.
        return

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')
    title = _find_meta(meta, 'og:title')

    authors = _format_list(book_info['author']['name'])
    # BUG FIX: has_key() is Python-2-only; `in` is equivalent and portable.
    if 'translator' in book_info:
        authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

    # Trailing volume number ("...N권") in the title indicates a series.
    series = re.search(u'(.*)\s*(\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))

    mi.language = 'Korean'
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    """Parse an Aladin detail page into Metadata and queue the result.

    Aborts (queueing nothing) when title, authors or the ISBN taken from
    the URL are missing; all other fields are best-effort with logged
    failures.
    """
    # The ISBN embedded in the URL doubles as the record identifier.
    try:
        isbn = self.extract_isbn(self.url)
    except:
        self.log.exception('No ISBN in URL: %r'%self.url)
        isbn = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r'%self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []
    # Without these three essentials there is nothing worth queueing.
    if not title or not authors or not isbn:
        self.log.error('Could not find title/authors/Aladin id for %r'%self.url)
        self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    #mi.set_identifier('isbn', isbn)
    mi.isbn = isbn
    self.isbn = isbn
    # ISBN-13: prefer the ISBN parsed from the page over the URL-derived one.
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r'%self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r'%self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!
    if mi.has_cover:
        self.log.info('Cover URL: '+mi.cover_url)
    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r'%self.url)
    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r'%self.url)
    mi.language = 'ko'
    mi.source_relevance = self.relevance
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def get_details(self):
    '''
    The get_details() function for stripping the website for all information
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Fetch the page; sort network failures into 404 / timeout / other.
    try:
        raw = self.browser.open_novisit(
            self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # BUG FIX: the original compared the whole document for *equality*
    # with "<title>404 - ", which can never be true; use containment
    # (as the sibling worker does).
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean/parse the HTML.
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Title
    try:
        title_node = root.xpath('//span[@itemprop="name"]')
        self.title = title_node[0].text
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Authors: one span holding a comma-separated list of names.
    try:
        author_node = root.xpath('//span[@class="expandAuthorName"]')
        author_strings = author_node[0].text.split(",")
        for name in author_strings:
            self.authors.append(name)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Series, rendered as "Name: index".
    try:
        series_node = root.xpath('//b[contains(text(), "Serie")]/a')
        if len(series_node) > 0:
            self.series = series_node[0].text.split(": ")[0].strip()
            self.series_index = series_node[0].text.split(": ")[-1].strip()
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Rating: no rating is scraped from this site; keep a neutral 0.0.
    # (The original wrapped this constant assignment in try/except,
    # which could never raise — removed as dead code.)
    self.rating = 0.0

    # ISBN
    try:
        isbn_node = root.xpath(
            '//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]'
        )
        if len(isbn_node) > 0:
            self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Comments/blurb from the meta description.
    try:
        comment_node = root.xpath('//meta[@name="description"]/@content')
        self.comments = comment_node[0]
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Cover URL (relative on the page; prefixed with the site origin).
    try:
        cover_node = root.xpath(
            '//div[@class="bookDetailCoverCover"]/img/@src')
        self.cover_url = "https://mofibo.com" + cover_node[0]
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Publisher
    try:
        publisher_node = root.xpath(
            '//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
        if len(publisher_node) > 0:
            self.publisher = publisher_node[0].text
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Language of the book; only languages present in lang_map resolve,
    # everything else maps to None.
    try:
        language_node = root.xpath('//b[@class="expanderLanguage"]')
        language = language_node[0].text.strip().replace("Sprog:", "").replace(
            " ", "")
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Publication date ("Udgivet: YYYY-MM-DD").
    try:
        pubdate_node = root.xpath(
            '//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]'
        )
        if len(pubdate_node) > 0:
            date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
            format_str = '%Y-%m-%d'  # The format
            self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Tags (single category span).
    try:
        tags = []
        tags_node = root.xpath('//span[@itemprop="category"]')
        tags.append(tags_node[0].text.strip())
        self.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Assemble the Metadata record from the scraped attributes.
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('mofibo', self.url)
    if self.series:
        try:
            meta_data.series = self.series
            meta_data.series_index = self.series_index
        except:
            self.log.exception('Error loading series')
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')
    if self.tags:
        try:
            meta_data.tags = self.tags
        except:
            self.log.exception('Error loading tags')

    # Queue the finished record.
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
def get_details(self):
    """Fetch the biblionet JSON detail record for self.url and queue Metadata.

    The endpoint returns JSON (not HTML); fields are read individually with
    per-field error logging so one bad field does not abort the record.
    """
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)
    # Fetch the raw response; classify failures as 404 / timeout / other.
    try:
        raw = self.browser.open_novisit(
            self.url, timeout=self.timeout).read().strip()
        self.log.info(raw)
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for biblionet timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return
    # A 404 error page can still come back with a 200 status.
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return
    try:
        # root = fromstring(clean_ascii_chars(raw))
        root = json.loads(raw)
        self.log.info(root)
    except:
        msg = 'Failed to parse book detail page: %r' % self.url
        self.log.exception(msg)
        return
    try:
        self.biblionetid = root['biblionetid']
    except:
        self.log.exception('Error parsing book id for url: %r' % self.url)
        self.biblionetid = None
    try:
        self.title = root['title'].strip()
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        self.title = None
        self.series_index = None
    try:
        self.authors = [root['authors'].strip()]
        self.log.info(self.authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None
    try:
        self.cover_url = root['cover_url']
        self.log.info('Parsed URL for cover:%r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.biblionetid,
                                                  self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)
    try:
        self.publisher = root['publisher']
        self.log.info('Parsed publisher:%s' % self.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)
    # Categories arrive as one string; normalise the DDC marker, drop
    # dashes, split into words and discard the trailing element.
    try:
        self.tags = root['categories'].replace('DDC: ', 'DDC:').replace(
            '-', '').split()[:-1]
        self.log.info('Parsed tags:%s' % self.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)
    try:
        self.pubdate = root['yr_published']
        self.log.info('Parsed publication date:%s' % self.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)
    # Assemble the Metadata record from the scraped attributes.
    mi = Metadata(self.title, self.authors)
    mi.set_identifier('biblionet', self.biblionetid)
    if self.series_index:
        try:
            mi.series_index = float(self.series_index)
        except:
            self.log.exception('Error loading series')
    if self.relevance:
        try:
            mi.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    if self.cover_url:
        try:
            mi.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    if self.publisher:
        try:
            mi.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    if self.tags:
        try:
            mi.tags = self.tags
        except:
            self.log.exception('Error loading tags')
    if self.pubdate:
        try:
            # yr_msg1/yr_msg2 are sentinel "no year" strings; a real value
            # is a year, stored as January 1st of that year.
            if self.pubdate not in (self.yr_msg1, self.yr_msg2):
                d = datetime.date(int(self.pubdate), 1, 1)
                mi.pubdate = d
        except:
            self.log.exception('Error loading pubdate')
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    """Parse an ISFDB publication page into Metadata and queue the result.

    The publication details live in a list of <li> items, each starting
    with a section label ('Publication', 'Authors', 'ISBN', ...).  Aborts
    without queueing when title, authors or the ISFDB ID are missing.
    """
    isfdb_id = None
    title = None
    authors = []
    isbn = None
    publisher = None
    pubdate = None
    # The ISFDB record id is the trailing number of the URL.
    try:
        isfdb_id = re.search('(\d+)$', self.url).groups(0)[0]
    except:
        self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)
    detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
    if not detail_nodes:
        detail_nodes = root.xpath('//div[@id="content"]/div/ul/li') # no table (on records with no image)
    for detail_node in detail_nodes:
        # Section label, e.g. 'Publication', 'Authors', 'ISBN', 'Date'.
        section = detail_node[0].text_content().strip().rstrip(':')
        #self.log.info(section)
        try:
            if section == 'Publication':
                title = detail_node[0].tail.strip()
                if not title:
                    # assume an extra span with a transliterated title tooltip
                    title = detail_node[1].text_content().strip()
                #self.log.info(title)
            elif section == 'Authors' or section == 'Editors':
                for a in detail_node.xpath('.//a'):
                    author = a.text_content().strip()
                    if section.startswith('Editors'):
                        authors.append(author + ' (Editor)')
                    else:
                        authors.append(author)
                #self.log.info(authors)
            elif section == 'ISBN':
                # value looks like "[1234567890]" — strip brackets/whitespace
                isbn = detail_node[0].tail.strip('[] \n')
                #self.log.info(isbn)
            elif section == 'Publisher':
                publisher = detail_node.xpath('a')[0].text_content().strip()
                #self.log.info(publisher)
            elif section == 'Date':
                pubdate = self._convert_date_text(detail_node[0].tail.strip())
                #self.log.info(pubdate)
        except:
            self.log.exception('Error parsing section %r for url: %r' % (section, self.url) )
    if not title or not authors or not isfdb_id:
        self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
        self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title,
                       authors))
        return
    mi = Metadata(title, authors)
    mi.set_identifier('isfdb', isfdb_id)
    self.isfdb_id = isfdb_id
    if isbn:
        self.isbn = mi.isbn = isbn
    if publisher:
        mi.publisher = publisher
    if pubdate:
        mi.pubdate = pubdate
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r'%self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!
    mi.source_relevance = self.relevance
    if self.isfdb_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    """Parse a YES24 detail page into Metadata and queue the result.

    Aborts without queueing when title, authors or the YES24 id are
    missing; every other field is best-effort with logged failures.
    """
    try:
        yes24_id = self.parse_yes24_id(self.url)
    except:
        self.log.exception('Error parsing YES24 id for url: %r'%self.url)
        yes24_id = None
    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r'%self.url)
        title = series = series_index = None
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []
    # Without these three essentials there is nothing worth queueing.
    if not title or not authors or not yes24_id:
        self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
        self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title,
                       authors))
        return
    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('yes24', yes24_id)
    self.yes24_id = yes24_id
    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r'%self.url)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r'%self.url)
    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r'%self.url)
    mi.has_cover = bool(self.cover_url)
    mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!
    try:
        mi.publisher = self.parse_publisher(root)
    except:
        self.log.exception('Error parsing publisher for url: %r'%self.url)
    try:
        mi.pubdate = self.parse_published_date(root)
    except:
        self.log.exception('Error parsing published date for url: %r'%self.url)
    mi.language = 'ko'
    mi.source_relevance = self.relevance
    if self.yes24_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)
    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def get_details(self):
    '''
    The get_details() function for stripping the website for all information
    '''
    self.log.info(" Worker.get_details:")
    self.log.info(" self: ", self)
    self.log.info(" self.url: ", self.url)

    # Fetch the page; classify network failures as 404 / timeout / other.
    try:
        raw = self.browser.open_novisit(
            self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # BUG FIX: the original tested `"<title>404 - " == raw`, comparing the
    # whole document for equality — it can never match.  Use containment.
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean/parse the HTML.
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # The second ld+json block holds the structured book data; some
    # fields are easier to read from JSON than from the markup.
    try:
        json_raw = root.xpath('(//script[@type="application/ld+json"])[2]')
        json_root = json.loads(json_raw[0].text.strip())
        #print(json.dumps(json_root, indent=4, sort_keys=True))
    except:
        self.log.error("Error loading JSON data")
        return

    # Title
    try:
        self.title = json_root['name']
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Authors
    try:
        author_node = root.xpath(
            '//h2[@class="product-page-heading__autor"]//a')
        for name in author_node:
            self.authors.append(name.text.strip())
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Rating (only present for some books).
    try:
        self.rating = float(json_root['aggregateRating']['ratingValue'])
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # ISBN
    try:
        self.isbn = json_root['isbn']
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Comments/blurb.
    # NOTE(review): parse_comments is called without `self.` — it appears
    # to be a module-level helper; confirm it exists at module scope.
    try:
        self.comments = parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Cover URL, cached against the ISBN.
    try:
        self.cover_url = json_root['image']
        self.log.info(' Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Publisher
    try:
        self.publisher = json_root['publisher']['name']
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Language; only languages present in lang_map resolve, others -> None.
    try:
        language = json_root['inLanguage']['name']
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Publication date (dd-mm-yyyy in the product info block).
    try:
        #pubdate_node = root.xpath('(//dl[@class="product-info-list"]//dd)[2]') # Format dd-mm-yyyy
        pubdate_node = root.xpath(
            '//div[@class="product-page-block__container"]//dd'
        )  # Format dd-mm-yyyy
        date_str = pubdate_node[0].text.strip()
        format_str = '%d-%m-%Y'  # The format
        self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Assemble the Metadata record from the scraped attributes.
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('saxo', self.url)
    if self.rating:
        try:
            meta_data.rating = self.rating
        except:
            self.log.exception('Error loading rating')
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')

    # Queue the finished record.
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)