class Cbdb(Source):
    '''
    Calibre metadata source plugin that downloads book metadata and covers
    from cbdb.cz.
    '''

    # Namespace map used by all XPath expressions built by this plugin.
    NAMESPACES = {
        'x': "http://www.w3.org/1999/xhtml"
    }

    # Platforms this plugin works on.
    supported_platforms = ['windows', 'osx', 'linux']

    BASE_URL = 'http://www.cbdb.cz/'
    BASE_DETAIL_URL = 'http://www.cbdb.cz/kniha-'

    # The name of this plugin. Must be set to something other than
    # 'Trivial Plugin' for it to work.
    name = 'cbdb'

    # Plugin version as a 3-tuple (major, minor, revision).
    version = (1, 0, 4)

    # A short string describing what this plugin does.
    description = u'Download metadata and cover from cbdb.cz'

    # The author of this plugin.
    author = u'MarDuke [email protected]'

    # When more than one plugin exists for a filetype, plugins run in order
    # of decreasing priority. Default priority is 1.
    priority = 1

    # The earliest version of calibre this plugin requires.
    minimum_calibre_version = (1, 0, 0)

    # If False, the user will not be able to disable this plugin.
    can_be_disabled = True

    # Capabilities supported by this plugin: 'identify' and 'cover'.
    capabilities = frozenset(['identify', 'cover'])

    # Metadata fields this plugin can potentially download during identify.
    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
                                'comments', 'publisher', 'identifier:isbn',
                                'rating', 'identifier:cbdb', 'languages'])

    # True if this plugin returns HTML formatted comments.
    has_html_comments = False

    # If True the browser adds Accept-Encoding: gzip to all requests.
    supports_gzip_transfer_encoding = False

    # Set to False if cached cover URLs are often unreliable.
    cached_cover_url_is_reliable = True

    # Option objects used to automatically build the configuration widget.
    options = (
        Option('max_search', 'number', 25, 'Maximum knih',
               'Maximum knih které se budou zkoumat jestli vyhovují hledaným parametrům'),
        Option('max_covers', 'number', 5, 'Maximum obálek',
               'Maximum obálek které se budou stahovat'),
        Option('serie_index', 'bool', True, 'Pozice v sérii',
               'Cbdb neudává pozici v sérii, pouze vypisuje seznam knih v sérii ve správném pořadí, takže pokud některá např. chybí jsou pozice rozhozené, je zde možnost tuto nespolehlivou vlastnost vypnout. Stále se ovšem bude zobrazovat alespoň informace o názvu série'),
    )

    # String displayed at the top of the config widget for this plugin.
    config_help_message = None

    # True if this source can return multiple covers for a given query.
    can_get_multiple_covers = True

    # If True, covers downloaded by this plugin are automatically trimmed.
    auto_trim_covers = False

    def identify(self, log, result_queue, abort, title=None, authors=None,
                 identifiers={}, timeout=30):
        '''
        Identify a book by its title/author/isbn/etc.

        Every Metadata object put into result_queue must have a
        source_relevance attribute (order in which results were returned).
        Any cover/isbn mapping information is cached before the Metadata
        object is put into result_queue.

        :param log: log object for debugging information/errors
        :param result_queue: results (Metadata objects) are put into it
        :param abort: if abort.is_set() returns True, abort and return ASAP
        :param title: title of the book, can be None
        :param authors: list of authors of the book, can be None
        :param identifiers: dict of other identifiers, e.g. {'isbn': '...'}
        :param timeout: timeout in seconds for network requests
        :return: None on success, otherwise a unicode error representation
        '''
        self.log = Log(self.name, log)

        found = []
        xml = None
        detail_ident = None

        # Test the previously found identifier first.
        ident = identifiers.get(self.name, None)

        XPath = partial(etree.XPath, namespaces=self.NAMESPACES)
        entry = XPath('//div[@id="search_result_box_books"]//table[@class="search_graphic"][1]')
        detail_test = XPath('//td[@id="book_photo_box"]/img/@src')

        query = self.create_query(title=title, authors=authors,
                                  identifiers=identifiers)
        if not query:
            self.log('Insufficient metadata to construct query')
            return

        br = self.browser
        try:
            self.log('download page search %s'%query)
            raw = br.open(query, timeout=timeout).read().strip()

            # Escape bare ampersands that are not part of a character entity
            # so the parser does not choke on them. (The previous version
            # replaced '&' with '&' - a no-op.)
            def fixHtml(obj):
                return obj.group().replace('&', '&amp;')

            raw = re.sub('&.{3}[^;]', fixHtml, raw)
            raw = raw.decode('utf-8', errors='replace')
            # NOTE: removed a debug dump to the hardcoded path
            # 'D:\tmp\cbdb.html' - it failed (and thereby aborted identify)
            # on any machine without that directory.
        except Exception as e:
            self.log.exception('Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.HTMLParser(recover=True)
            clean = clean_ascii_chars(raw)
            feed = fromstring(clean, parser=parser)
            entries = entry(feed)
            if len(entries) == 0:
                # The search redirected straight to a detail page; reuse the
                # already parsed tree and extract the numeric id from the
                # cover image URL.
                xml = feed
                detail_detect = detail_test(feed)[0]
                detail_ident = int(re.findall('\d+', detail_detect)[0])
                if ident is not None and detail_ident != ident:
                    found.append(ident)
            else:
                self.log('Found %i matches'%len(entries))

                # Author surnames only - used by the pre-filter comparator.
                act_authors = []
                for act in (authors or []):
                    act_authors.append(act.split(" ")[-1])

                tmp_entries = []
                for book_ref in entries:
                    title_tag = book_ref.xpath(
                        ".//div/a[starts-with(@href, 'kniha-')]",
                        namespaces=self.NAMESPACES)
                    auths = []  # author surnames
                    authors_tag = book_ref.xpath(
                        ".//a[@class='search_author_link']",
                        namespaces=self.NAMESPACES)
                    for i in authors_tag:
                        auths.append(i.text.split(" ")[-1])
                    tmp_entries.append(
                        (title_tag[0].get('href'), title_tag[0].text, auths))

                # Keep only the most relevant hits when there are too many.
                if len(tmp_entries) > self.prefs['max_search']:
                    tmp_entries.sort(key=self.prefilter_compare_gen(
                        title=title, authors=act_authors))
                    tmp_entries = tmp_entries[:self.prefs['max_search']]

                for val in tmp_entries:
                    found.append(val[0])
        except Exception as e:
            self.log.exception('Failed to parse identify results')
            return as_unicode(e)

        # Move a previously known identifier to the front of the queue.
        if ident and found.count(ident) > 0:
            found.remove(ident)
            found.insert(0, ident)

        try:
            workers = []
            # On redirect, hand the already parsed tree to the worker so the
            # page is not downloaded and parsed again.
            if xml is not None:
                workers = [Worker(detail_ident, result_queue, br, log, 0,
                                  self, xml)]
            workers += [Worker(ident, result_queue, br, log, i, self, None)
                        for i, ident in enumerate(found)]

            for w in workers:
                w.start()
                time.sleep(0.1)

            while not abort.is_set():
                a_worker_is_alive = False
                for w in workers:
                    w.join(0.2)
                    if abort.is_set():
                        break
                    if w.is_alive():
                        a_worker_is_alive = True
                if not a_worker_is_alive:
                    break
        except Exception as e:
            self.log.exception(e)

        return None

    def create_query(self, title=None, authors=None, identifiers={}):
        '''
        Build the search URL for the given title, or return None when there
        is not enough metadata to search with.
        '''
        from urllib import urlencode
        q = ''
        if title:
            q += ' '.join(self.get_title_tokens(title))
        if isinstance(q, unicode):
            q = q.encode('utf-8')
        if not q:
            return None
        return self.BASE_URL + 'hledat?' + urlencode({'text': q})

    def get_cached_cover_url(self, identifiers):
        '''
        Return a cached, validated cover URL for the book identified by the
        identifiers dict, or None if no such URL exists.
        '''
        url = None
        ident = identifiers.get(self.name, None)
        if ident is not None:
            url = self.cached_identifier_to_cover_url(ident)
        return url

    def download_cover(self, log, result_queue, abort, title=None,
                       authors=None, identifiers={}, timeout=30,
                       get_best_cover=False):
        '''
        Download a cover and put it into result_queue. Parameters have the
        same meaning as for identify(). Uses cached cover URLs when
        available; otherwise runs identify first and takes the URLs cached
        by the best-matching result.
        '''
        self.log = Log(self.name, log)
        cached_urls = self.get_cached_cover_url(identifiers)
        if not title:
            return
        if not cached_urls:
            self.log('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title, authors, identifiers,
                          timeout)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_urls = self.get_cached_cover_url(mi.identifiers)
                if cached_urls is not None:
                    break
        if cached_urls is None:
            log.info('No cover found')
            return
        self.log("Covers:%s"%cached_urls)
        if abort.is_set():
            return
        self.download_multiple_covers(title, authors, cached_urls,
                                      get_best_cover, timeout, result_queue,
                                      abort, log)

    def get_book_url(self, identifiers):
        '''
        Return (identifier_type, identifier_value, URL) for the book
        identified by identifiers at this source, or None when the cbdb
        identifier is missing.
        '''
        ident = identifiers.get(self.name, None)
        if ident:
            return (self.name, ident, "%skniha-%s"%(self.BASE_URL, ident))
        else:
            return None

    def get_book_url_name(self, idtype, idval, url):
        '''
        Return a human readable name from the return value of get_book_url().
        '''
        return self.name

    def identify_results_keygen(self, title=None, authors=None,
                                identifiers={}):
        '''
        Return a key function that sorts Metadata objects by relevance for
        the given search query (title, authors, identifiers).
        '''
        def keygen(mi):
            return MetadataCompareKeyGen(mi, self, title, authors,
                                         identifiers)
        return keygen

    def prefilter_compare_gen(self, title=None, authors=None):
        '''
        Return a key function used for pre-ordering when the search returns
        more results than we want to inspect; it ranks the most relevant
        results by title and authors.
        '''
        def keygen(data):
            return PreFilterMetadataCompare(data, self, title, authors)
        return keygen
class Worker(Thread):
    '''
    Downloads and parses one book detail page from cbdb.cz and puts the
    resulting Metadata object into the shared result queue.
    '''

    # string id of the book (search href 'kniha-...' or a numeric string)
    ident = None
    # numeric book id extracted from ident
    number = None

    def __init__(self, ident, result_queue, browser, log, relevance, plugin,
                 xml, timeout=20):
        Thread.__init__(self)
        self.daemon = True
        self.ident, self.result_queue = ident, result_queue
        self.browser = browser.clone_browser()
        self.relevance = relevance
        self.plugin, self.timeout = plugin, timeout
        self.cover_url = self.isbn = None
        self.XPath = partial(etree.XPath, namespaces=plugin.NAMESPACES)
        self.xml = xml
        if xml is not None:
            # Pre-parsed page (search redirected straight to the detail
            # page); ident is the numeric id in that case.
            self.number = int(ident)
        else:
            if ident.startswith('kniha-'):
                self.number = int(self.ident.split('-')[1])
            else:
                self.number = int(ident)
        self.log = Log("worker %i"%self.number, log)

    def initXPath(self):
        # Compiled XPath expressions evaluated against the detail page.
        self.xpath_title = self.XPath('//*[@itemprop="name"]/text()')
        self.xpath_authors = self.XPath('//a[@itemprop="author"]/text()')
        self.xpath_comments = self.XPath('//p[@itemprop="about"]')
        self.xpath_rating = self.XPath('//div[@id="item_rating"]/text()')
        self.xpath_isbn = self.XPath('//span[@itemprop="isbn"]/text()')
        self.xpath_publisher = self.XPath('//div[@class="book_info_line"]/a[starts-with(@href, "nakladatelstvi-")]/text()')
        self.xpath_pub_date = self.XPath('//div[@class="book_info_line"]/a[starts-with(@href, "nakladatelstvi-")]/following-sibling::text()[1]')
        self.xpath_tags = self.XPath('//span[@itemprop="genre"]/text()')
        self.xpath_serie = self.XPath('//a[@href="?show=serie"]/text()')
        self.xpath_serie_index = self.XPath('//a[@href="?show=serie"]/preceding-sibling::text()')
        self.xpath_cover = self.XPath('//div[@id="book_covers_control"]/@onclick')

    def run(self):
        self.initXPath()
        # Reuse the pre-parsed tree when identify() already has it.
        if self.xml is not None:
            xml_detail = self.xml
        else:
            xml_detail = self.download_detail()
        if xml_detail is not None:
            try:
                result = self.parse(xml_detail)
                if result:
                    self.result_queue.put(result)
            except Exception as e:
                self.log.exception(e)
        else:
            self.log('Download metadata failed for: %r'%self.ident)

    def parse(self, xml_detail):
        '''
        Extract all supported fields from the detail page and build a
        Metadata object, or return None when title/authors are missing.
        '''
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_detail)
        publisher, pub_year = self.parse_publisher(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail, title)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name: str(self.number)}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(str(self.number),
                                                          cover)
            return mi
        else:
            self.log('Result skipped for because title or authors not found')
            return None

    def parse_title(self, xml_detail):
        tmp = self.xpath_title(xml_detail)
        if len(tmp) > 0:
            res = unicode(tmp[0])
            self.log('Found title:%s'%res)
            return res
        else:
            self.log('Found title:None')
            return None

    def parse_authors(self, xml_detail):
        tmp = self.xpath_authors(xml_detail)
        if len(tmp) > 0:
            self.log('Found authors:%s'%tmp)
            auths = []
            for author in tmp:
                auths.append(unicode(author))
            return auths
        else:
            self.log('Found authors:None')
            return None

    def parse_comments(self, xml_detail):
        tmp = self.xpath_comments(xml_detail)
        if len(tmp) > 0:
            result = unicode(tmp[0].text).strip()
            self.log('Found comment:%s'%result)
            return result
        else:
            self.log('Found comment:None')
            return None

    def parse_rating(self, xml_detail):
        tmp = self.xpath_rating(xml_detail)
        if len(tmp) > 0:
            # Percentage -> 0-5 star scale. Use float division so half-star
            # values survive (the previous int division truncated e.g.
            # 90% to 4.0 instead of 4.5 under Python 2).
            rating = int(tmp[0].replace('%', '')) / 20.0
            self.log('Found rating:%s'%rating)
            return rating
        else:
            self.log('Found rating:None')
            return None

    def parse_isbn(self, xml_detail):
        tmp = self.xpath_isbn(xml_detail)
        if len(tmp) > 0:
            self.log('Found ISBN:%s'%tmp[0].strip())
            return tmp[0].strip()
        else:
            self.log('Found ISBN:None')
            return None

    def parse_publisher(self, xml_detail):
        '''
        Return (publisher, pubdate) - both None when not found.
        '''
        tmp = self.xpath_publisher(xml_detail)
        tmpDate = self.xpath_pub_date(xml_detail)
        if len(tmp) > 0:
            publisher = tmp[0]
            pubDate = self.prepare_date(
                int(re.search('(\d+)', tmpDate[0]).group(0)))
            self.log('Found publisher:%s'%publisher)
            self.log('Found pub date:%s'%pubDate)
            return (publisher, pubDate)
        self.log('Found publisher:None')
        self.log('Found pub date:None')
        return (None, None)

    def parse_tags(self, xml_detail):
        tmp = self.xpath_tags(xml_detail)
        if len(tmp) > 0:
            result = tmp
            self.log('Found tags:%s'%result)
            return result
        else:
            self.log('Found tags:None')
            return None

    def parse_serie(self, xml_detail, title):
        '''
        Return [series_name, index]. The index is only computed when the
        'serie_index' preference is enabled, because cbdb does not state
        the position explicitly (it is derived from the series listing).
        '''
        tmp = self.xpath_serie(xml_detail)
        if len(tmp) == 0:
            self.log('Found serie:None')
            return [None, None]
        index = 0
        if self.plugin.prefs['serie_index']:
            tmpIndex = self.xpath_serie_index(xml_detail)
            index = int(re.search('(\d+)', tmpIndex[0]).group(0))
        self.log('Found serie:%s[%i]'%(tmp[0], index))
        return [tmp[0], index]

    def parse_cover(self, xml_detail):
        '''
        Build the list of cover image URLs, capped by the 'max_covers'
        preference. The onclick handler encodes the book id and the number
        of additional covers.
        '''
        tmp = self.xpath_cover(xml_detail)
        result = []
        if len(tmp) > 0:
            nums = re.findall('\d+', tmp[0])
            ident = int(nums[0])
            num_add = int(nums[1])
            result.append(self.plugin.BASE_URL + 'books/%i.jpg'%ident)
            cnt = num_add
            covers = self.plugin.prefs['max_covers']
            if covers and cnt > covers:
                cnt = covers
            for n in range(1, cnt):
                result.append(self.plugin.BASE_URL + 'books/%i_%i.jpg'%(ident, n))
        if len(result) > 0:
            self.log('Found covers:%s'%result)
        else:
            self.log('Found covers:None')
        return result

    def download_detail(self):
        '''
        Download and parse the detail page for self.ident; return the parsed
        tree or None on failure.
        '''
        # Search-result hrefs already carry the 'kniha-' prefix that
        # BASE_DETAIL_URL ends with; avoid producing '.../kniha-kniha-...'.
        if self.ident.startswith('kniha-'):
            query = self.plugin.BASE_URL + self.ident
        else:
            query = self.plugin.BASE_DETAIL_URL + self.ident
        br = self.browser
        try:
            self.log('download page detail %s'%query)
            data = br.open(query, timeout=self.timeout).read().strip()
            parser = etree.HTMLParser(recover=True)
            clean = clean_ascii_chars(data)
            # NOTE: removed a debug dump to the hardcoded path
            # 'D:\tmp\file<ident>.html' - a failed write made a successful
            # download be reported as failed.
            xml = fromstring(clean, parser=parser)
            return xml
        except Exception as e:
            self.log.exception('Failed to make download : %r'%query)
            return None

    def prepare_date(self, year):
        '''
        Build a timezone-aware datetime (UTC) for 1 January of the year.
        '''
        from calibre.utils.date import utc_tz
        return datetime.datetime(year, 1, 1, tzinfo=utc_tz)
class Worker(Thread):
    '''
    Downloads and parses one book detail page and puts the resulting
    Metadata object into the shared result queue.

    NOTE(review): this file contains a second ``class Worker(Thread)``
    definition; this one shadows the earlier one at import time. The two
    parse different page layouts and use different preference keys
    ('edition', 'edition_prefix', 'edition_to_comments') - confirm which
    variant is meant to be active.
    '''

    # string id of the book
    ident = None
    # int id (unused placeholder in this variant)
    number = None

    def __init__(self, ident, result_queue, browser, log, relevance, plugin,
                 xml, timeout=20):
        Thread.__init__(self)
        self.daemon = True
        self.ident, self.result_queue = ident, result_queue
        self.browser = browser.clone_browser()
        self.relevance = relevance
        self.plugin, self.timeout = plugin, timeout
        self.cover_url = self.isbn = None
        self.XPath = partial(etree.XPath, namespaces=plugin.NAMESPACES)
        self.xml = xml
        self.log = Log("worker %s"%ident, log)

    def initXPath(self):
        # Plain XPath strings evaluated with Element.xpath() against the
        # detail-table layout of the publication page.
        self.xpath_title = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "Název:"]/following::td[1]/text()'
        self.xpath_authors = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "Autor:"]/following::td[1]/a/text()'
        self.xpath_comments = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "Další informace:"]/following::td[1]/text()'
        self.xpath_stars = '//input[@id="rating"]/@value'
        self.xpath_isbn = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "ISBN:"]/following::td[1]/text()'
        self.xpath_publisher = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "Nakladatel (rok vydání):"]/following::td[1]//text()'
        self.xpath_tags = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "Žánry a lit. útvary:"]/following::td[1]/a/text()'
        self.xpath_edition = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "Edice:"]/following::td[1]//text()'
        self.xpath_serie = '//table[@class="detail_table"]//td[@class="detail_td_item_name" and text() = "Série:"]/following::td[1]//text()'
        self.xpath_cover = '//td[@class="detail_td_item_value"]/img/@src'

    def run(self):
        self.initXPath()
        # Reuse the pre-parsed tree when the caller already has it.
        if self.xml is not None:
            xml_detail = self.xml
        else:
            xml_detail = self.download_detail()
        if xml_detail is not None:
            try:
                result = self.parse(xml_detail)
                if result:
                    self.result_queue.put(result)
            except Exception as e:
                self.log.exception(e)
        else:
            self.log.exception('Download metadata failed for: %s'%self.ident)

    def parse(self, xml_detail):
        '''
        Extract all supported fields from the detail page and build a
        Metadata object, or return None when title/authors are missing.
        '''
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_detail)
        publisher, pub_year = self.parse_publisher(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name: self.ident}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(self.ident, cover)
            return mi
        else:
            return None

    def parse_title(self, xml_detail):
        tmp = xml_detail.xpath(self.xpath_title)
        if len(tmp) > 0:
            self.log('Found title:%s'%tmp[0].strip())
            return tmp[0].strip()
        else:
            self.log('Found title:None')
            return None

    def parse_authors(self, xml_detail):
        tmp = xml_detail.xpath(self.xpath_authors)
        if len(tmp) > 0:
            auths = []
            # Page lists authors as "Surname, Firstname"; flip to
            # "Firstname Surname".
            for a in tmp:
                self.log(a)
                parts = a.split(",")
                self.log(parts)
                auths.append("%s %s"%(parts[1].strip(), parts[0]))
            self.log('Found authors:%s'%auths)
            return auths
        else:
            self.log('Found authors:None')
            return None

    def parse_comments(self, xml_detail):
        '''
        Build the comments text; optionally add the edition information at
        the start or end per the 'edition_to_comments' preference.
        '''
        tmp = xml_detail.xpath(self.xpath_comments)
        if len(tmp) > 0:
            result = "<br/>".join(tmp).strip()
            if self.plugin.prefs['edition_to_comments'] != 'Nepřidávat':
                tmp = xml_detail.xpath(self.xpath_edition)
                if len(tmp) > 0:
                    num_tmp = int(re.search("\d+", tmp[1]).group())
                    add = "Edice: %s %d. díl"%(tmp[0], num_tmp)
                    # Bug fix: the original compared with '!=', which
                    # appended for 'Na začatek' and prepended for
                    # 'Na konec' - exactly swapped. Compare with '=='.
                    if self.plugin.prefs['edition_to_comments'] == 'Na konec':
                        result = result + "<br/>" + add
                    elif self.plugin.prefs['edition_to_comments'] == 'Na začatek':
                        result = add + "<br/>" + result
            self.log('Found comment:%s'%result)
            return result
        else:
            self.log('Found comment:None')
            return None

    def parse_rating(self, xml_detail):
        tmp = xml_detail.xpath(self.xpath_stars)
        if len(tmp) > 0:
            rating = float(tmp[0])
            self.log('Found rating:%s'%rating)
            # NOTE(review): the page's rating widget value is shifted by one
            # here - presumably it is 0-based; confirm against the site.
            return rating + 1
        else:
            self.log('Found rating:None')
            return None

    def parse_isbn(self, xml_detail):
        tmp = xml_detail.xpath(self.xpath_isbn)
        if len(tmp) > 0:
            self.log('Found ISBN:%s'%tmp[0])
            return tmp[0]
        else:
            self.log('Found ISBN:None')
            return None

    def parse_publisher(self, xml_detail):
        '''
        Return (publisher, pubdate) - both None when not found. The year is
        printed in parentheses after the publisher name.
        '''
        tmp = xml_detail.xpath(self.xpath_publisher)
        if len(tmp) > 0:
            pub = tmp[0]
            # Strip the surrounding parentheses, e.g. '(2005)' -> 2005.
            pubdt = int(tmp[1].strip()[1:-1])
            self.log('Found publisher:%s'%pub)
            self.log('Found pubdate:%s'%pubdt)
            return [pub, datetime.datetime(pubdt, 1, 1, tzinfo=utc_tz)]
        else:
            self.log('Found publisher:None')
            self.log('Found pubdate:None')
            return (None, None)

    def parse_tags(self, xml_detail):
        tags = []
        tags.extend(xml_detail.xpath(self.xpath_tags))
        # Optionally add the edition name (with configured prefix) as a tag.
        if self.plugin.prefs['edition']:
            tmp = xml_detail.xpath(self.xpath_edition)
            if len(tmp) > 1:
                tags.append(self.plugin.prefs['edition_prefix'] + tmp[0])
        if len(tags) > 0:
            self.log('Found tags:%s'%tags)
            return tags
        else:
            self.log('Found tags:None')
            return None

    def parse_serie(self, xml_detail):
        '''
        Return [series_name, index] parsed from the 'Série:' row, or
        [None, None] when absent.
        '''
        tmp = xml_detail.xpath(self.xpath_serie)
        if len(tmp) > 1:
            serie = tmp[0]
            serie_index = int(tmp[1].split('-')[1].strip())
            self.log('Found serie:%s[%d]'%(serie, serie_index))
            return [serie, serie_index]
        else:
            self.log('Found serie:None')
            return [None, None]

    def parse_cover(self, xml_detail):
        tmp = xml_detail.xpath(self.xpath_cover)
        if len(tmp) > 0:
            cover = "%snew/%s"%(self.plugin.BASE_URL, tmp[0])
            self.log('Found covers:%s'%cover)
            return cover
        else:
            self.log('Found covers:None')

    def download_detail(self):
        '''
        Download and parse the detail page for self.ident; return the parsed
        tree or None on failure.
        '''
        query = "%snew/?mainpage=pub&subpage=detail&id=%s"%(self.plugin.BASE_URL, self.ident)
        br = self.browser
        try:
            self.log('download page detail %s'%query)
            data = br.open(query, timeout=self.timeout).read().strip()
            parser = etree.HTMLParser(recover=True)
            clean = clean_ascii_chars(data)
            xml = fromstring(clean, parser=parser)
            # NOTE: removed a debug dump to the hardcoded path
            # '\tmp\worker-<ident>.html' - a failed write made a successful
            # download be reported as failed.
            return xml
        except Exception as e:
            self.log.exception('Failed to make download : %r'%query)
            return None