def parse(self, xml_detail):
    """Assemble a Metadata object from the parsed detail XML.

    Returns the populated Metadata, or None when title or authors could
    not be extracted.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    # Without both a title and authors there is nothing usable to return.
    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name: str(self.number)}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(str(self.number), cover)
    return mi
def to_metadata(self, log, entry):  # {{{
    """Convert one XML search-result entry into a calibre Metadata object."""
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    title = entry.xpath(xp_template.format('Name'))
    author = entry.xpath(xp_template.format('Author'))
    # The author field is one comma-separated string; normalize each name.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    ozon_id = entry.xpath(xp_template.format('ID'))
    mi.identifiers = {'ozon': ozon_id}
    mi.comments = entry.xpath(xp_template.format('Annotation'))

    mi.ozon_cover_url = None
    cover = entry.xpath(xp_template.format('Picture'))
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    pub_year = entry.xpath(xp_template.format('Year'))
    if pub_year:
        mi.pubdate = toPubdate(log, pub_year)

    rating = entry.xpath(xp_template.format('ClientRatingValue'))
    if rating:
        try:
            # 'rating' is a floating point number between 0 and 10.
            # OZON rates out of 5, calibre out of 10, but there is a bug? in identify
            mi.rating = float(rating)
        except Exception:
            # Best-effort: ignore unparsable ratings.
            pass
    # Fix: removed a stray no-op `rating` expression statement that followed
    # the except clause in the original; also narrowed the bare `except:`.
    return mi
def to_metadata_for_single_entry(self, log, ozon_id, title, authors):  # {{{
    """Build a minimal Metadata object for a single redirect-page result.

    Only the ozon identifier is known at this point.
    """
    result = Metadata(title, authors)
    result.identifiers = {'ozon': ozon_id}
    return result
def to_metadata_for_single_entry(self, log, ozon_id, title, authors):  # {{{
    """Create a bare Metadata carrying just the ozon identifier.

    Used when parsing javascript data from the redirect page.
    """
    skeleton = Metadata(title, authors)
    skeleton.identifiers = {'ozon': ozon_id}
    return skeleton
def to_metadata(self, log, entry):  # {{{
    """Convert one XML search-result entry into a calibre Metadata object."""
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    title = entry.xpath(xp_template.format('Name'))
    author = entry.xpath(xp_template.format('Author'))
    # The author field is one comma-separated string; normalize each name.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    ozon_id = entry.xpath(xp_template.format('ID'))
    mi.identifiers = {'ozon': ozon_id}
    mi.comments = entry.xpath(xp_template.format('Annotation'))

    mi.ozon_cover_url = None
    cover = entry.xpath(xp_template.format('Picture'))
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    pub_year = entry.xpath(xp_template.format('Year'))
    if pub_year:
        mi.pubdate = toPubdate(log, pub_year)

    rating = entry.xpath(xp_template.format('ClientRatingValue'))
    if rating:
        try:
            # 'rating' is a floating point number between 0 and 10.
            # OZON rates out of 5, calibre out of 10, but there is a bug? in identify
            mi.rating = float(rating)
        except Exception:
            # Best-effort: ignore unparsable ratings.
            pass
    # Fix: removed a stray no-op `rating` expression statement that followed
    # the except clause in the original; also narrowed the bare `except:`.
    return mi
def to_metadata(self, log, entry):  # {{{
    """Convert one HTML search-result entry into a calibre Metadata object."""
    title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
    author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
    # The author field is one comma-separated string; normalize each name.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    # The detail link has the form .../context/detail/id/<ozon_id>/
    ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
    if ozon_id:
        mi.identifiers = {'ozon':ozon_id}

    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    # Fix: removed an unreachable pubdate branch -- the original hard-coded
    # `pub_year = None` immediately before `if pub_year:`, so the toPubdate
    # call could never execute.
    mi.rating = self.get_rating(entry)
    return mi
def parse(self, xml_detail, xml_more_info):
    """Combine the detail page and the more-info page into a Metadata object.

    Returns None (after logging) when title or authors are missing.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    serie, serie_index = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    # Guard clause: both a title and authors are required for a usable record.
    if title is None or authors is None:
        self.log('Result skipped for because title or authors not found')
        return None

    mi = Metadata(as_unicode(title), authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name:self.ident}
    mi.rating = rating
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(self.ident, cover)
    return mi
def get_metadata_from_detail(self, log, entry, title, authors, identifiers):  # {{{
    """Scrape Metadata from a book detail page."""
    title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())'))
    author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
    # The author cell holds one comma-separated string; normalize every name.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    # The canonical <link> carries the id: .../context/detail/id/<ozon_id>/
    ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(//link[@rel="canonical"][contains(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
    if ozon_id:
        mi.identifiers = {'ozon':ozon_id}

    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    mi.rating = self.get_rating(entry)
    if not mi.rating:
        log.debug('No rating (from_detail) found. ozon_id:%s'%ozon_id)
    return mi
def get_metadata_from_detail(self, log, entry, title, authors, identifiers):  # {{{
    """Extract Metadata fields from a detail page entry."""
    title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())'))
    author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
    # Normalize each name out of the single comma-separated author string.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    # Pull the id out of the first .../context/detail/id/<ozon_id>/ link.
    ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
    if ozon_id:
        mi.identifiers = {'ozon':ozon_id}

    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    mi.rating = self.get_rating(entry)
    if not mi.rating:
        log.debug('No rating (from_detail) found. ozon_id:%s'%ozon_id)
    return mi
def to_metadata(self, log, entry):  # {{{
    """Convert one HTML search-result entry into a calibre Metadata object."""
    title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
    author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))
    # The author field is one comma-separated string; normalize each name.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    # The detail link has the form .../context/detail/id/<ozon_id>/
    ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
    if ozon_id:
        mi.identifiers = {'ozon':ozon_id}

    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    # Fix: removed an unreachable pubdate branch -- the original hard-coded
    # `pub_year = None` immediately before `if pub_year:`, so the toPubdate
    # call could never execute.
    mi.rating = self.get_rating(entry)
    return mi
def parse(self, xml_detail):
    """Parse a pipe-delimited detail record into a Metadata object.

    The second line of the payload is a '|'-separated record; field indices
    are positional (0=author, 1=title, 3=isbn, 6=publisher, 13=comments,
    34=publication date as YYYY-MM-DD).
    """
    data = xml_detail.split('\n')[1].split("|")
    self.log(data)

    title = data[1]
    authors = [data[0]]
    comments = data[13]
    isbn = data[3]
    publisher = data[6]
    pub_date_tmp = data[34].split('-')
    pub_date = datetime.datetime(int(pub_date_tmp[0]), int(pub_date_tmp[1]),
                                 int(pub_date_tmp[2]), tzinfo=utc_tz)

    # Fix: the field is always a string, so the old `is not None` test could
    # never fail -- an empty ISBN used to produce a bogus cover URL.
    if isbn:
        isbn_tmp = re.sub("-", "", isbn)
        cover = "%s/images/covers/%s.jpg"%(self.plugin.BASE_URL, isbn_tmp)
    else:
        cover = None

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name:self.ident}
        mi.publisher = publisher
        mi.pubdate = pub_date
        mi.isbn = isbn
        mi.cover_url = cover
        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)
        return mi
    else:
        return None
def result2meta(self, result, prev_identifiers=None):
    '''
    Converts the result dict into Calibre metadata.
    Note: Source download plugins do not have access to custom columns.
    '''
    # Fix: the default was a mutable `{}` shared across all calls; use the
    # None-sentinel idiom (backward compatible for every caller).
    if prev_identifiers is None:
        prev_identifiers = {}

    title = get_title(result)
    authors = get_author_list(result)
    mi = Metadata(title=title, authors=authors)
    mi.identifiers = update_identifiers(prev_identifiers, result)
    put_publisher(mi, result)
    put_language(mi, result)
    self.put_pubdate(mi, result)
    put_tags(mi, result)
    put_journal(mi, result)
    self.put_series_index(mi, result)

    # Optionally collect the abstract and/or query info into the comments.
    comments = ""
    if prefs['abstract_to_comment'] and 'abstract' in result:
        comments = "\n\n".join([comments, result['abstract']])
    if prefs['query_to_comment']:
        extra_meta = self.mkComments(result)
        extra_plus = map(lambda x: "crossref:%s" % x, extra_meta)
        extra = "\n".join(extra_plus)
        comments = "\n\n".join([comments, extra])
    mi.comments = comments

    # Derive source_relevance from the result score when one is present.
    if 'score' in result:
        mi.source_relevance = 100 - result['score']
    else:
        mi.source_relevance = 100
    return mi
def default_mi(self):
    """Return a fully-populated sample Metadata object."""
    from calibre.ebooks.metadata.book.base import Metadata
    mi = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
    mi.series = _('A series of samples')
    mi.series_index = 4
    mi.tags = [_('Tag One'), _('Tag Two')]
    mi.publisher = _('Some publisher')
    mi.rating = 4
    # Fix: use the https URL, consistent with the sibling implementation of
    # this sample function in the codebase.
    mi.identifiers = {'isbn':'123456789', 'url': 'https://calibre-ebook.com'}
    mi.languages = ['eng', 'fra']
    mi.pubdate = mi.timestamp = now()
    return mi
def default_mi(self):
    """Build a fully-populated sample Metadata object."""
    from calibre.ebooks.metadata.book.base import Metadata
    sample = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
    sample.series = _('A series of samples')
    sample.series_index = 4
    sample.tags = [_('Tag One'), _('Tag Two')]
    sample.publisher = _('Some publisher')
    sample.rating = 4
    sample.identifiers = {'isbn':'123456789', 'url': 'https://calibre-ebook.com'}
    sample.languages = ['eng', 'fra']
    sample.pubdate = sample.timestamp = now()
    return sample
def parse(self, xml_detail):
    """Walk the MARC-style record table and build a Metadata object.

    Rows are dispatched on their MARC field tag (245/246 title, 100/700
    authors, 020 ISBN, 260 publisher, 490 series, 655 7 tags, SYS id).
    """
    sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
    authors = []
    tags = []

    xpath = self.XPath('//table[@id="record"]//tr')
    for row in xpath(xml_detail):
        ch = row.getchildren()
        txt = ch[0].text.strip()
        data = self.normalize(ch[1].text)
        if txt.startswith('245') and title is None:
            title = self.parse_title(data)
        # NOTE: a 246 row intentionally overrides any 245 title.
        if txt.startswith('246'):
            title = self.parse_title(data)
        elif txt.startswith('100') or txt.startswith('700'):
            res = self.parse_author(data)
            if res is not None:
                authors.append(res)
        elif txt == 'SYS':
            sys_ident = data.strip()
        elif txt =='020':
            isbn = self.parse_isbn(data)
        elif txt == '260':
            publisher, pub_year = self.parse_publisher(data)
        elif txt.startswith('490') and serie is None:
            serie, serie_index = self.parse_serie(data)
        elif txt == '655 7':
            tags.append(self.parse_tags(data))

    if isbn is not None and isbn != '':
        cover = self.parse_cover(isbn)

    if title is None or not authors or sys_ident is None:
        self.log('Data not found')
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.identifiers = {self.plugin.name:sys_ident}
    mi.tags = tags
    mi.publisher = publisher
    mi.pubdate = pub_year
    mi.isbn = isbn
    mi.series = serie
    mi.series_index = serie_index
    mi.cover_url = cover
    if cover:
        self.plugin.cache_identifier_to_cover_url(sys_ident, cover)
    return mi
def to_metadata(self, log, entry):  # {{{
    """Convert one search-result tile into a calibre Metadata object."""
    title = unicode(
        entry.xpath(
            u'normalize-space(.//div[@itemprop="name"][1]/text())'))
    author = unicode(
        entry.xpath(
            u'normalize-space(.//div[contains(@class, "mPerson")])'))
    # The author cell is one comma-separated string; normalize each name.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    # The tile's data-href ends with .../<ozon_id>/
    ozon_id = entry.get('data-href').split('/')[-2]
    if ozon_id:
        mi.identifiers = {'ozon': ozon_id}

    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    log.debug(u'cover: -----> %s' % cover)
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    pub_year = None
    pub_year_block = entry.xpath(
        u'.//div[@class="bOneTileProperty"]/text()')
    # Fix: use a raw string for the regex -- '\d' in a plain literal is an
    # invalid escape sequence (DeprecationWarning; an error in newer Pythons).
    year_pattern = re.compile(r'\d{4}')
    if pub_year_block:
        pub_year = re.search(year_pattern, pub_year_block[0])
    if pub_year:
        mi.pubdate = toPubdate(log, pub_year.group())

    mi.rating = self.get_rating(log, entry)
    return mi
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    """Convert a douban entry dict into a calibre Metadata object."""
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    id_url = entry_['url']
    douban_id = entry_['id']
    title_ = entry_['title']
    subtitle = entry_['subtitle']
    authors = [x.strip() for x in entry_['author'] if x]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    mi.comments = entry_['summary']
    mi.publisher = entry_['publisher']

    # ISBN: keep the ISBN-10 as primary, record both forms.
    mi.isbn = entry_['isbn10']
    mi.all_isbns = [entry_['isbn10'], entry_['isbn13']]

    # Tags
    mi.tags = [x['name'].strip() for x in entry_['tags']]

    # pubdate (best effort; default to mid-month when day is missing)
    pubdate = entry_['pubdate']
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Rating: halve the 0-10 average for calibre's 0-5 scale.
    mi.rating = float(entry_['rating']['average']) / 2.0

    # Cover
    mi.has_douban_cover = entry_['image']
    return mi
def parse(self, xml_detail):
    """Parse title, authors, comments, rating, tags and series from the XML.

    Returns None when either the title or the authors are missing.
    """
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)

    if title is None or authors is None:
        return None

    mi = Metadata(title, authors)
    mi.languages = {'ces'}
    mi.comments = as_unicode(comments)
    mi.identifiers = {self.plugin.name:self.ident}
    mi.rating = rating
    mi.tags = tags
    mi.series = serie
    mi.series_index = serie_index
    return mi
def to_metadata(self, log, entry):  # {{{
    """Convert one search-result tile into a calibre Metadata object."""
    title = unicode(entry.xpath(u'normalize-space(.//div[@itemprop="name"][1]/text())'))
    author = unicode(entry.xpath(u'normalize-space(.//div[contains(@class, "mPerson")])'))
    # The author cell is one comma-separated string; normalize each name.
    norm_authors = map(_normalizeAuthorNameWithInitials,
                       map(unicode.strip, unicode(author).split(u',')))
    mi = Metadata(title, norm_authors)

    # The tile's data-href ends with .../<ozon_id>/
    ozon_id = entry.get('data-href').split('/')[-2]
    if ozon_id:
        mi.identifiers = {'ozon': ozon_id}

    mi.ozon_cover_url = None
    cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
    log.debug(u'cover: -----> %s' % cover)
    if cover:
        mi.ozon_cover_url = _translateToBigCoverUrl(cover)

    pub_year = None
    pub_year_block = entry.xpath(u'.//div[@class="bOneTileProperty"]/text()')
    # Fix: use a raw string for the regex -- '\d' in a plain literal is an
    # invalid escape sequence (DeprecationWarning; an error in newer Pythons).
    year_pattern = re.compile(r'\d{4}')
    if pub_year_block:
        pub_year = re.search(year_pattern, pub_year_block[0])
    if pub_year:
        mi.pubdate = toPubdate(log, pub_year.group())

    mi.rating = self.get_rating(log, entry)
    return mi
def start(self, title, authors, identifiers):
    """Seed the covers widget with a skeleton book and run the dialog modally."""
    placeholder = Metadata(title, authors)
    placeholder.identifiers = identifiers
    self.covers_widget.start(placeholder, self.current_cover, title, authors, {})
    return self.exec_()
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    """Turn a douban JSON entry into a calibre Metadata object.

    Returns None when the entry has no douban id or no title.
    """
    from calibre.utils.date import parse_date, utcnow

    douban_id = entry_.get('id')
    title = entry_.get('title')
    description = entry_.get('summary')
    # subtitle = entry_.get('subtitle')  # TODO: std metada doesn't have this field
    publisher = entry_.get('publisher')
    isbn = entry_.get('isbn13')  # ISBN11 is obsolute, use ISBN13
    pubdate = entry_.get('pubdate')
    authors = entry_.get('author')
    book_tags = entry_.get('tags')
    rating = entry_.get('rating')
    cover_url = entry_.get('images', {}).get('large')
    series = entry_.get('series')

    if not authors:
        authors = [_('Unknown')]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {'douban': douban_id}
    mi.publisher = publisher
    mi.comments = description

    # ISBN may be a single string or a list of candidates.
    isbns = []
    if isinstance(isbn, (type(''), bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for candidate in isbn:
            if check_isbn(candidate):
                isbns.append(candidate)
    if isbns:
        # Prefer the longest (ISBN-13 over ISBN-10).
        mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

    # Tags
    mi.tags = [tag['name'] for tag in book_tags]

    # pubdate (best effort; default to mid-month when day is missing)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Rating: halve the 0-10 average for calibre's 0-5 scale.
    if rating:
        try:
            mi.rating = float(rating['average']) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series['title']
    return mi
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Convert a Google Books atom entry into a calibre Metadata object.

    Fetches the entry's detail feed for the extended fields; returns the
    partially-filled Metadata if that fetch fails, or None when the entry
    has no id or no title.
    """
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')
    rating = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        # Stripped text of the first match, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # Fix: the original tested `not title`, but `title` is the compiled XPath
    # object and is always truthy -- entries with an empty title were never
    # discarded. Test the extracted string instead.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google':google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN: only ISBN-prefixed identifiers are collected (validated).
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        # Prefer the longest (ISBN-13 over ISBN-10).
        mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

    # Tags: split slash-separated subjects, de-duplicated in order.
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate (best effort; default to mid-month when day is missing)
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings: some feeds rate out of 10; normalize to 0-5.
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except Exception:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break
    return mi
def start(self, title, authors, identifiers):
    """Launch the cover-selection dialog for a skeleton book record."""
    stub = Metadata(title, authors)
    stub.identifiers = identifiers
    self.covers_widget.start(stub, self.current_cover, title, authors, {})
    return self.exec_()
def parse(self, raw, desc_raw):
    """Parse a JD product page (raw HTML) and its description payload into Metadata."""
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_date, utcnow
    import json

    root = parse_html(raw.decode('gb18030'))
    title = root.xpath('//*[@id="name"]/div[1]/text()')
    title = title[0].strip()
    authors = []
    for i in root.xpath('//*[@id="p-author"]/a'):
        authors.append(i.text.strip())
    mi = Metadata(title, authors)

    # Key/value product attributes ("key:value" list items).
    information = root.xpath('//*[@id="parameter2"]/li')
    info = dict()
    for i in information:
        tmp = etree.tostring(i, method='text', encoding='utf-8').split(u':')
        info[tmp[0].strip()] = tmp[1].strip()

    # Identifiers
    # Fix: copy the plugin's identifier dict -- assigning it directly and then
    # adding keys mutated shared state on the plugin object.
    mi.identifiers = dict(self.plugin.identifiers)
    mi.identifiers['jd'] = self.sku
    isbn = info['ISBN']
    self.log.error(isbn)
    if isbn:
        mi.isbn = isbn
        self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        mi.identifiers['isbn'] = isbn

    # Publisher
    mi.publisher = info.get(u'出版社')

    # Pubdate (best effort; default to mid-month when day is missing)
    pubdate = info.get(u'出版时间')
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            self.log.error('Failed to parse pubdate %r' % pubdate)

    # Series
    mi.series = info.get(u'丛书名')

    img = root.xpath('//*[@id="spec-n1"]/img')
    cover = img[0].get('src')
    if cover:
        if not cover.startswith('http'):
            cover = 'https:' + cover
        self.plugin.cache_identifier_to_cover_url(self.sku, cover)
    self.log.error(cover)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(
        self.sku) is not None

    # Comments
    # showdesc({"date":1583588455348,"content":" ... "})
    # Fix: the original used `try/finally: return mi`, which silently
    # swallowed *every* exception raised in the try body (even
    # KeyboardInterrupt). Use an explicit best-effort except instead.
    try:
        desc = json.loads(desc_raw[9:-1].decode('gb18030'))
        desc_root = parse_html(desc['content'])
        div = desc_root.xpath(
            '//*[@id="detail-tag-id-3"]/div[2]/div/text()')
        mi.comments = div[0]
    except Exception:
        # The description is optional; return what we have.
        pass
    return mi
def retrieve_bokelai_detail(self, bokelai_id, log, result_queue, timeout):
    """Fetch a bokelai detail page, parse its JSON-LD block, and queue the Metadata."""
    detail_url = self.BOKELAI_DETAIL_URL % bokelai_id
    log.info(detail_url)

    try:
        br = self.browser
        _raw = br.open_novisit(detail_url, timeout=timeout)
        raw = _raw.read()
    except Exception:
        log.exception('Failed to load detail page: %s' % detail_url)
        return

    root = etree.HTML(raw)
    info_json_text = root.xpath(
        "//script[@type='application/ld+json']")[0].text
    log.info(info_json_text)
    info_json = json.loads(info_json_text)

    title = info_json['name']
    authors = info_json['author'][0]['name'].split(",")
    publisher = info_json['publisher'][0]['name']
    isbn = info_json['workExample']['workExample']['isbn']
    pubdate = info_json['datePublished']

    comments = ""
    comments_ele = root.xpath("(//div[@class='content'])[1]//text()")
    comments = "\n".join(comments_ele)

    tags = list()
    for ele in root.xpath("//li[contains(text(),'本書分類:')]/a"):
        log.info(ele.text)
        # Fix: the split-on-"/" branch was duplicated in the original, so
        # every multi-part category was appended twice.
        if "/" in ele.text:
            tags.extend(ele.text.split("/"))
        else:
            tags.append(ele.text)

    cover_url = re.search(r'https[^\?\=\&]*' + bokelai_id + r'[^\?\=\&]*',
                          info_json['image']).group(0)

    if not authors:
        authors = [_('Unknown')]

    log.info(title, authors, publisher, isbn, pubdate, comments, tags,
             cover_url)

    mi = Metadata(title, authors)
    mi.identifiers = {'bokelai': bokelai_id, 'isbn': isbn}
    mi.publisher = publisher
    mi.comments = comments
    mi.isbn = isbn
    mi.tags = tags

    if pubdate:
        try:
            from calibre.utils.date import parse_date, utcnow
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    if cover_url is not None:
        mi.has_bokelai_cover = cover_url
        self.cache_identifier_to_cover_url(mi.identifiers['bokelai'],
                                           mi.has_bokelai_cover)
    else:
        mi.has_bokelai_cover = None

    result_queue.put(mi)
def to_metadata(log, gmetadata, ExHentai_Status):  # {{{
    """Build a Metadata object from an e-hentai gallery metadata dict."""
    title = gmetadata['title']
    title_jpn = gmetadata['title_jpn']
    tags = gmetadata['tags']
    rating = gmetadata['rating']
    category = gmetadata['category']
    gid = gmetadata['gid']
    token = gmetadata['token']
    thumb = gmetadata['thumb']

    # Prefer the Japanese title when one is present.
    raw_title = title_jpn if title_jpn else title

    # Split "[author] title"-style gallery names into author and title parts.
    pat1 = re.compile(
        r'(?P<comments>.*?\[(?P<author>(?:(?!汉化|漢化)[^\[\]])*)\](?:\s*(?:\[[^\(\)]+\]|\([^\[\]\(\)]+\))\s*)*(?P<title>[^\[\]\(\)]+).*)'
    )
    m = re.search(pat1, raw_title)
    if m:
        title_ = m.group('title').strip()
        author = m.group('author').strip()
    else:
        title_ = raw_title.strip()
        author = 'Unknown'
        log.exception('Title match failed. Title is %s' % raw_title)
    authors = [author]

    mi = Metadata(title_, authors)
    mi.identifiers = {
        'ehentai': '%s_%s_%d' % (str(gid), str(token), int(ExHentai_Status))
    }

    # publisher: a leading "(...)" group in the raw title.
    pat2 = re.compile(r'^\(([^\[\]\(\)]*)\)')
    pub_match = re.search(pat2, raw_title)
    if pub_match:
        mi.publisher = pub_match.group(1).strip()
    else:
        mi.publisher = 'Unknown'
        log.exception('Not Found publisher.')

    # Tags: language:* tags set mi.language; other namespaced tags are kept,
    # un-namespaced ones are dropped.
    tags_ = []
    for tag in tags:
        if re.match('language', tag):
            tag_ = re.sub('language:', '', tag)
            if tag_ != 'translated':
                mi.language = tag_
            else:
                tags_.append(tag_)
        elif ':' not in tag:
            log('drop tag %s' % tag)
            continue
        else:
            tags_.append(tag)
    tags_.append(category)
    mi.tags = tags_

    # rating
    mi.rating = float(rating)

    # cover
    mi.has_ehentai_cover = thumb if thumb else None
    return mi
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Convert a douban atom entry into a calibre Metadata object.

    Fetches the entry's detail feed for the extended fields; returns the
    partially-filled Metadata if that fetch fails, or None when the entry
    has no id or no title.
    """
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        # Stripped text of the first match, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # Fix: the original tested `not title`, but `title` is the compiled XPath
    # object and is always truthy -- entries with an empty title were never
    # discarded. Test the extracted string instead.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban':douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        # Prefer the longest (ISBN-13 over ISBN-10).
        mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

    # Tags: split slash-separated tag names, de-duplicated in order.
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate (best effort; default to mid-month when day is missing)
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings: halve the 0-10 average for calibre's 0-5 scale.
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
def to_metadata(self, log, entry_, timeout):  # {{{
    """Turn a douban entry dict into a calibre Metadata object, or None."""
    from calibre.utils.date import parse_date, utcnow

    log.info("to_metadata")
    douban_id = entry_.get("id")
    title = entry_.get("title")
    description = entry_.get("summary")
    # subtitle = entry_.get('subtitle')  # TODO: std metada doesn't have this field
    publisher = entry_.get("publisher")
    isbn = entry_.get("isbn13")  # ISBN11 is obsolute, use ISBN13
    pubdate = entry_.get("pubdate")
    authors = entry_.get("author")
    book_tags = entry_.get("tags")
    rating = entry_.get("rating")
    cover_url = entry_.get("cover")
    series = entry_.get("series")

    if not authors:
        authors = [("Unknown")]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {"douban": douban_id}
    mi.publisher = publisher
    mi.comments = description

    # ISBN may be a single string or a list of candidates.
    isbns = []
    if isinstance(isbn, (type(""), bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for candidate in isbn:
            if check_isbn(candidate):
                isbns.append(candidate)
    if isbns:
        # Prefer the longest (ISBN-13 over ISBN-10).
        mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

    # Tags
    mi.tags = book_tags

    # pubdate (best effort; default to mid-month when day is missing)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except BaseException:
            log.error("Failed to parse pubdate %r" % pubdate)

    # Rating: halve the 0-10 value for calibre's 0-5 scale.
    if rating:
        try:
            mi.rating = rating / 2.0
        except BaseException:
            log.exception("Failed to parse rating")
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If URL contains "book-default", the book doesn't have a cover
        if u.find("book-default") == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series
    return mi
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Convert a douban atom entry into a calibre Metadata object.

    Fetches the entry's detail feed for the extended fields; returns the
    partially-filled Metadata if that fetch fails, or None when the entry
    has no id or no title.
    """
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        # Stripped text of the first match, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # Fix: the original tested `not title`, but `title` is the compiled XPath
    # object and is always truthy -- entries with an empty title were never
    # discarded. Test the extracted string instead.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        # Prefer the longest (ISBN-13 over ISBN-10).
        mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

    # Tags: split slash-separated tag names, de-duplicated in order.
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate (best effort; default to mid-month when day is missing)
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings: halve the 0-10 average for calibre's 0-5 scale.
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi