def process_metadata(self, idx, content, codec):
    if idx == 100:
        if self.mi.is_null('authors'):
            self.mi.authors = []
        au = clean_xml_chars(self.decode(content).strip())
        self.mi.authors.append(au)
        if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
            self.mi.author_sort = au.strip()
    elif idx == 101:
        self.mi.publisher = clean_xml_chars(self.decode(content).strip())
        if self.mi.publisher in {'Unknown', _('Unknown')}:
            self.mi.publisher = None
    elif idx == 103:
        self.mi.comments = clean_xml_chars(self.decode(content).strip())
    elif idx == 104:
        raw = check_isbn(self.decode(content).strip().replace('-', ''))
        if raw:
            self.mi.isbn = raw
    elif idx == 105:
        if not self.mi.tags:
            self.mi.tags = []
        self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
        self.mi.tags = list(set(self.mi.tags))
    elif idx == 106:
        try:
            self.mi.pubdate = parse_date(content, as_utc=False)
        except:
            pass
    elif idx == 108:
        self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
    elif idx == 112:  # dc:source set in some EBSP amazon samples
        try:
            content = content.decode(codec).strip()
            isig = 'urn:isbn:'
            if content.lower().startswith(isig):
                raw = check_isbn(content[len(isig):])
                if raw and not self.mi.isbn:
                    self.mi.isbn = raw
            elif content.startswith('calibre:'):
                # calibre book uuid is stored here by recent calibre
                # releases
                cid = content[len('calibre:'):]
                if cid:
                    self.mi.application_id = self.mi.uuid = cid
        except:
            pass
    elif idx == 113:  # ASIN or other id
        try:
            self.uuid = content.decode('ascii')
            self.mi.set_identifier('mobi-asin', self.uuid)
        except:
            self.uuid = None
    elif idx == 116:
        self.start_offset, = struct.unpack(b'>L', content)
    elif idx == 121:
        self.kf8_header, = struct.unpack(b'>L', content)
        if self.kf8_header == NULL_INDEX:
            self.kf8_header = None

def get_metadata(stream, cover=True):
    with TemporaryDirectory("_pdf_metadata_read") as pdfpath:
        stream.seek(0)
        with open(os.path.join(pdfpath, "src.pdf"), "wb") as f:
            shutil.copyfileobj(stream, f)
        try:
            res = fork_job("calibre.ebooks.metadata.pdf", "read_info",
                           (pdfpath, bool(cover)))
        except WorkerError as e:
            prints(e.orig_tb)
            raise RuntimeError("Failed to run pdfinfo")
        info = res["result"]
        with open(res["stdout_stderr"], "rb") as f:
            raw = f.read().strip()
            if raw:
                prints(raw)
        if not info:
            raise ValueError("Could not read info dict from PDF")
        covpath = os.path.join(pdfpath, "cover.jpg")
        cdata = None
        if cover and os.path.exists(covpath):
            with open(covpath, "rb") as f:
                cdata = f.read()

    title = info.get("Title", None)
    au = info.get("Author", None)
    if au is None:
        au = [_("Unknown")]
    else:
        au = string_to_authors(au)
    mi = MetaInformation(title, au)
    # if isbn is not None:
    #     mi.isbn = isbn
    creator = info.get("Creator", None)
    if creator:
        mi.book_producer = creator

    keywords = info.get("Keywords", None)
    mi.tags = []
    if keywords:
        mi.tags = [x.strip() for x in keywords.split(",")]
        isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)]
        if isbn:
            mi.isbn = isbn = isbn[0]
            mi.tags = [x for x in mi.tags if check_isbn(x) != isbn]

    subject = info.get("Subject", None)
    if subject:
        mi.tags.insert(0, subject)

    if cdata:
        mi.cover_data = ("jpeg", cdata)

    return mi

def create_query(self, log, title=None, authors=None, identifiers={}):
    isbn = check_isbn(identifiers.get('isbn', None))
    q = ''
    url = ''
    if isbn is not None:
        q = '&isbn=' + isbn
        url = '/search/search.nhn?serviceSm=advbook.basic&ic=service.summary' + q
    elif title or authors:
        title_tokens = list(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True))
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in title_tokens]
        tokens += [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in author_tokens]
        q += '&query=' + '+'.join(tokens)
        url = '/search/search.nhn?sm=sta_hty.book' + q
    if not url:
        return None
    log.info('Search from %s' % (url))
    return NaverBook.BASE_URL + url

def create_query(self, log, title=None, authors=None, identifiers={}):
    isbn = check_isbn(identifiers.get('isbn', None))
    url = ''
    if title or authors:
        title_tokens = list(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True))
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in title_tokens]
        tokens += [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in author_tokens]
        url = '/search/?q=' + '+'.join(tokens)
    if not url:
        return None
    log.info('Search from %s' % (url))
    return RidiBooks.BASE_URL + url

def _create_query(self, log, title=None, authors=None, identifiers={}):
    '''
    Generates the search url to use to find the book
    '''
    isbn = check_isbn(identifiers.get('isbn', None))
    q = []
    if isbn is not None:
        # do isbn search
        q.append('Isbn=' + isbn)
    if title or authors:
        # do title and or author based search
        # tokenize the author and title fields from the current metadata
        title_tokens = list(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True))
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        # sanitize the title and author info before sending
        title_tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in title_tokens]
        author_tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in author_tokens]
        # build the query from the tokens
        if len(title_tokens):
            q.append("Title={0}".format('+'.join(title_tokens)))
        if len(author_tokens):
            q.append("Author={0}".format('+'.join(author_tokens)))
    q = '&'.join(q)
    if not q:
        return None
    if isinstance(q, unicode):
        q = q.encode('utf-8')
    return Shelfari.BASE_URL + '/search/books?' + q

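# Illustrative note for _create_query above (not taken from the plugin): if the
# title tokenizer yielded ['Foo', 'Bar'] and the author tokenizer ['John', 'Smith'],
# the joined query would be 'Title=Foo+Bar&Author=John+Smith', and the returned URL
# would be Shelfari.BASE_URL + '/search/books?Title=Foo+Bar&Author=John+Smith'.
# The sample tokens are assumptions for illustration only.
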
def get_cached_cover_url(self, identifiers):
    url = None
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is None:
        return None
    url = self.COVERURL % isbn
    return url

def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers=None, timeout=30):
    log.debug(u'Bookradar identification started ...')

    identifiers = identifiers or {}
    search_tokens = []
    if title:
        search_tokens += list(self.get_title_tokens(title))
    if authors:
        search_tokens += list(self.get_author_tokens(authors, only_first_author=True))
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn:
        search_tokens += (isbn,)
    search_str = ' '.join(search_tokens)
    url = self.url_pattern % search_str

    log.info(u'Searching for: %s' % search_str)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        log.exception('Failed to get data from `%s`: %s' % (url, e.message))
        return as_unicode(e)

    if abort.is_set():
        return

    metadata = self.parse_response(response, isbn_initial=isbn, log=log)
    for result in metadata:
        self.clean_downloaded_metadata(result)
        result_queue.put(result)

def create_query2(self, log, title=None, authors=None, identifiers={}):
    '''
    The edelweiss advanced search appears to be broken, use the keyword search
    instead, until it is fixed.
    '''
    from urllib import urlencode
    BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?'
    params = {
        'group': 'search',
        'section': 'CatalogOverview',
        'searchType': 1,
        'searchOrgID': '',
        'searchCatalogID': '',
        'searchMailingID': '',
        'searchSelect': 1,
    }
    keywords = []
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is not None:
        keywords.append(isbn)
    elif title or authors:
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            keywords.extend(title_tokens)
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        if author_tokens:
            keywords.extend(author_tokens)
    if not keywords:
        return None
    params['keywords'] = (' '.join(keywords)).encode('utf-8')
    return BASE_URL + urlencode(params)

def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout):  # {{{
    req_isbn = identifiers.get("isbn", None)

    for mi in metadata:
        if abort.is_set():
            break
        try:
            ozon_id = mi.identifiers["ozon"]

            try:
                self.get_book_details(log, mi, timeout)
            except:
                log.exception("Failed to get details for metadata: %s" % mi.title)

            all_isbns = getattr(mi, "all_isbns", [])
            if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
                log.debug("skipped, no requested ISBN %s found" % req_isbn)
                continue

            for isbn in all_isbns:
                self.cache_isbn_to_identifier(isbn, ozon_id)

            if mi.ozon_cover_url:
                self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)

            self.clean_downloaded_metadata(mi)
            result_queue.put(mi)
        except:
            log.exception("Failed to get details for metadata: %s" % mi.title)

def create_query2(self, log, title=None, authors=None, identifiers={}):
    '''
    The edelweiss advanced search appears to be broken, use the keyword search
    instead, until it is fixed.
    '''
    from urllib import urlencode
    BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?'
    params = {
        'group': 'search',
        'section': 'CatalogOverview',
        'searchType': 1,
        'searchOrgID': '',
        'searchCatalogID': '',
        'searchMailingID': '',
        'searchSelect': 1,
    }
    keywords = []
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is not None:
        keywords.append(isbn)
    elif title or authors:
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            keywords.extend(title_tokens)
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        if author_tokens:
            keywords.extend(author_tokens)
    if not keywords:
        return None
    params['keywords'] = (' '.join(keywords)).encode('utf-8')
    return BASE_URL + urlencode(params)

def parse_new_details(self, root, mi, non_hero):
    table = non_hero.xpath('descendant::table')[0]
    for tr in table.xpath('descendant::tr'):
        cells = tr.xpath('descendant::td')
        if len(cells) == 2:
            name = self.totext(cells[0])
            val = self.totext(cells[1])
            if not val:
                continue
            if name in self.language_names:
                ans = self.lang_map.get(val, None)
                if not ans:
                    ans = canonicalize_lang(val)
                if ans:
                    mi.language = ans
            elif name in self.publisher_names:
                pub = val.partition(';')[0].partition('(')[0].strip()
                if pub:
                    mi.publisher = pub
                date = val.rpartition('(')[-1].replace(')', '').strip()
                try:
                    from calibre.utils.date import parse_only_date
                    date = self.delocalize_datestr(date)
                    mi.pubdate = parse_only_date(date, assume_utc=True)
                except:
                    self.log.exception('Failed to parse pubdate: %s' % val)
            elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
                ans = check_isbn(val)
                if ans:
                    self.isbn = mi.isbn = ans

def _format_isbn(log, isbn):  # {{{
    # for now only RUS ISBN are supported
    # http://ru.wikipedia.org/wiki/ISBN_российских_издательств
    isbn_pat = re.compile(r"""
        ^
        (\d{3})?            # match GS1 Prefix for ISBN13
        (5)                 # group identifier for Russian-speaking countries
        (                   # begin variable length for Publisher
            [01]\d{1}|      # 2x
            [2-6]\d{2}|     # 3x
            7\d{3}|         # 4x (starting with 7)
            8[0-4]\d{2}|    # 4x (starting with 8)
            9[2567]\d{2}|   # 4x (starting with 9)
            99[26]\d{1}|    # 4x (starting with 99)
            8[5-9]\d{3}|    # 5x (starting with 8)
            9[348]\d{3}|    # 5x (starting with 9)
            900\d{2}|       # 5x (starting with 900)
            91[0-8]\d{2}|   # 5x (starting with 91)
            90[1-9]\d{3}|   # 6x (starting with 90)
            919\d{3}|       # 6x (starting with 919)
            99[^26]\d{4}    # 7x (starting with 99)
        )                   # end variable length for Publisher
        (\d+)               # Title
        ([\dX])             # Check digit
        $
    """, re.VERBOSE)

    res = check_isbn(isbn)
    if res:
        m = isbn_pat.match(res)
        if m:
            res = '-'.join([g for g in m.groups() if g])
        else:
            log.error('cannot format ISBN %s. For now only russian ISBNs are supported' % isbn)
    return res

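# Minimal, self-contained sketch of the hyphenation step that _format_isbn
# performs above. It is an assumption-laden demo: it uses a cut-down copy of
# the pattern (two- and three-digit publisher branches only), does not call
# calibre's check_isbn, and the sample ISBN is illustrative.
import re

_demo_pat = re.compile(r'^(\d{3})?(5)([01]\d{1}|[2-6]\d{2}|7\d{3})(\d+)([\dX])$')

def _demo_hyphenate(isbn13):
    # join the captured GS1 prefix, group, publisher, title and check digit with '-'
    m = _demo_pat.match(isbn13)
    return '-'.join(g for g in m.groups() if g) if m else isbn13

# _demo_hyphenate('9785170632503') -> '978-5-17-063250-3'
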
def clean_downloaded_metadata(self, mi):
    if mi.title and self.domain in ('com', 'uk'):
        mi.title = fixcase(mi.title)
    mi.authors = fixauthors(mi.authors)
    if self.domain in ('com', 'uk'):
        mi.tags = list(map(fixcase, mi.tags))
    mi.isbn = check_isbn(mi.isbn)

def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict={}):  # {{{
    req_isbn = identifiers.get('isbn', None)

    for mi in metadata:
        if abort.is_set():
            break
        try:
            ozon_id = mi.identifiers['ozon']

            try:
                self.get_book_details(log, mi, timeout,
                    cachedPagesDict[ozon_id] if cachedPagesDict and ozon_id in cachedPagesDict else None)
            except:
                log.exception(u'Failed to get details for metadata: %s' % mi.title)

            all_isbns = getattr(mi, 'all_isbns', [])
            if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
                log.debug(u'skipped, no requested ISBN %s found' % req_isbn)
                continue

            for isbn in all_isbns:
                self.cache_isbn_to_identifier(isbn, ozon_id)

            if mi.ozon_cover_url:
                self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)

            self.clean_downloaded_metadata(mi)
            result_queue.put(mi)
        except:
            log.exception(u'Failed to get details for metadata: %s' % mi.title)

def create_query(self, log, title=None, authors=None, identifiers={}):
    from urllib import urlencode
    BASE_URL = 'http://edelweiss.abovethetreeline.com/Browse.aspx?source=catalog&rg=4187&group=browse&pg=0&'
    params = {
        'browseType': 'title',
        'startIndex': 0,
        'savecook': 1,
        'sord': 20,
        'secSord': 20,
        'tertSord': 20,
    }
    keywords = []
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is not None:
        keywords.append(isbn)
    elif title:
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            keywords.extend(title_tokens)
        # Searching with author names does not work on edelweiss
        # author_tokens = self.get_author_tokens(authors,
        #         only_first_author=True)
        # if author_tokens:
        #     keywords.extend(author_tokens)
    if not keywords:
        return None
    params['bsk'] = (' '.join(keywords)).encode('utf-8')
    return BASE_URL + urlencode(params)

def create_query(self, title=None, authors=None, identifiers={}):  # {{{
    base_url = BASE_URL % self.isbndb_key
    isbn = check_isbn(identifiers.get('isbn', None))
    q = ''
    if isbn is not None:
        q = 'index1=isbn&value1=' + isbn
    elif title or authors:
        tokens = []
        title_tokens = list(self.get_title_tokens(title))
        tokens += title_tokens
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        tokens += author_tokens
        tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in tokens]
        q = '+'.join(tokens)
        q = 'index1=combined&value1=' + q
    if not q:
        return None
    if isinstance(q, unicode):
        q = q.encode('utf-8')
    return base_url + q

def get_metadata(stream, extract_cover=True):
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if data.has_key('title'):
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif data.has_key('creator'):
        mi.authors = string_to_authors(data['creator'])
    if data.has_key('description'):
        mi.comments = data['description']
    if data.has_key('language'):
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except:
            pass  # Do not let an error reading the cover prevent reading other data

    return mi

def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
    from urllib import urlencode
    BASE_URL = 'https://books.google.com/books/feeds/volumes?'
    isbn = check_isbn(identifiers.get('isbn', None))
    q = ''
    if isbn is not None:
        q += 'isbn:' + isbn
    elif title or authors:
        def build_term(prefix, parts):
            return ' '.join('in' + prefix + ':' + x for x in parts)
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            q += build_term('title', title_tokens)
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        if author_tokens:
            q += ('+' if q else '') + build_term('author', author_tokens)
    if isinstance(q, unicode):
        q = q.encode('utf-8')
    if not q:
        return None
    return BASE_URL + urlencode({
        'q': q,
        'max-results': 20,
        'start-index': 1,
        'min-viewability': 'none',
    })

def create_query(self, log, title=None, authors=None, identifiers={}):
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode
    import time
    BASE_URL = ('https://www.edelweiss.plus/GetTreelineControl.aspx?'
                'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&')
    keywords = []
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is not None:
        keywords.append(isbn)
    elif title:
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            keywords.extend(title_tokens)
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        if author_tokens:
            keywords.extend(author_tokens)
    if not keywords:
        return None
    params = {
        'q': (' '.join(keywords)).encode('utf-8'),
        '_': str(int(time.time()))
    }
    return BASE_URL + urlencode(params)

def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
    md = self.worker_class(self.browser, timeout)
    d = {}
    idval = identifiers.get(self.idkey, None)
    isbn = identifiers.get('isbn', None)
    if idval:
        d['id'] = idval
    if isbn:
        d['isbn'] = isbn
    if title:
        d['title'] = title
    if authors:
        d['authors'] = authors
    md.query(d, maxresults=self.maxresults)
    while not abort.is_set():
        md.join(0.2)
        if abort.is_set():
            break
        if not md.is_alive():
            break
        time.sleep(self.sleep_time)
    if not abort.is_set():
        for i in range(0, len(md.ans)):
            mi = self.data2mi(md.ans[i])
            mi.source_relevance = i  # Less means more relevant.
            mi.isbn = check_isbn(mi.isbn)
            result_queue.put(mi)
    return None

def create_query(self, log, title=None, authors=None, identifiers={}, domain=None):  # {{{
    from urllib import urlencode
    if domain is None:
        domain = self.domain

    idomain, asin = self.get_domain_and_asin(identifiers)
    if idomain is not None:
        domain = idomain

    # See the amazon detailed search page to get all options
    q = {"search-alias": "aps", "unfiltered": "1"}

    if domain == "com":
        q["sort"] = "relevanceexprank"
    else:
        q["sort"] = "relevancerank"

    isbn = check_isbn(identifiers.get("isbn", None))

    if asin is not None:
        q["field-keywords"] = asin
    elif isbn is not None:
        q["field-isbn"] = isbn
    else:
        # Only return book results
        q["search-alias"] = {"br": "digital-text", "nl": "aps"}.get(domain, "stripbooks")
        if title:
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q["field-title"] = " ".join(title_tokens)
        if authors:
            author_tokens = self.get_author_tokens(authors, only_first_author=True)
            if author_tokens:
                q["field-author"] = " ".join(author_tokens)

        if not ("field-keywords" in q or "field-isbn" in q or ("field-title" in q)):
            # Insufficient metadata to make an identify query
            return None, None

    # magic parameter to enable Japanese Shift_JIS encoding.
    if domain == "jp":
        q["__mk_ja_JP"] = "カタカナ"
    if domain == "nl":
        q["__mk_nl_NL"] = "ÅMÅŽÕÑ"
        if "field-keywords" not in q:
            q["field-keywords"] = ""
        for f in "field-isbn field-title field-author".split():
            q["field-keywords"] += " " + q.pop(f, "")
        q["field-keywords"] = q["field-keywords"].strip()

    if domain == "jp":
        encode_to = "Shift_JIS"
    elif domain == "nl":
        encode_to = "utf-8"
    else:
        encode_to = "latin1"
    encoded_q = dict([(x.encode(encode_to, "ignore"), y.encode(encode_to, "ignore"))
                      for x, y in q.iteritems()])
    url = "http://www.amazon.%s/s/?" % self.get_website_domain(domain) + urlencode(encoded_q)
    return url, domain

def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
    from urllib import urlencode
    BASE_URL = 'https://books.google.com/books/feeds/volumes?'
    isbn = check_isbn(identifiers.get('isbn', None))
    q = ''
    if isbn is not None:
        q += 'isbn:' + isbn
    elif title or authors:
        def build_term(prefix, parts):
            return ' '.join('in' + prefix + ':' + x for x in parts)
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            q += build_term('title', title_tokens)
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        if author_tokens:
            q += ('+' if q else '') + build_term('author', author_tokens)
    if isinstance(q, unicode):
        q = q.encode('utf-8')
    if not q:
        return None
    return BASE_URL + urlencode({
        'q': q,
        'max-results': 20,
        'start-index': 1,
        'min-viewability': 'none',
    })

def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
    from urllib import urlencode
    BASE_URL = "http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?"
    params = {
        "group": "search",
        "searchType": 999,
        "searchOrgID": "",
        "dateRange": 0,
        "isbn": "",
    }
    for num in (0, 1, 2, 3, 4, 5, 6, 200, 201, 202, 204):
        params["condition%d" % num] = 1
        params["keywords%d" % num] = ""

    title_key, author_key = "keywords200", "keywords201"

    isbn = check_isbn(identifiers.get("isbn", None))
    found = False
    if isbn is not None:
        params["isbn"] = isbn
        found = True
    elif title or authors:
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            params[title_key] = " ".join(title_tokens)
            found = True
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        if author_tokens:
            params[author_key] = " ".join(author_tokens)
            found = True
    if not found:
        return None

    for k in (title_key, author_key, "isbn"):
        v = params[k]
        if isinstance(v, unicode):
            params[k] = v.encode("utf-8")

    return BASE_URL + urlencode(params)

def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout):  # {{{
    req_isbn = identifiers.get('isbn', None)

    for mi in metadata:
        if abort.is_set():
            break
        try:
            ozon_id = mi.identifiers['ozon']

            try:
                self.get_book_details(log, mi, timeout)
            except:
                log.exception(u'Failed to get details for metadata: %s' % mi.title)

            all_isbns = getattr(mi, 'all_isbns', [])
            if req_isbn and all_isbns and check_isbn(req_isbn) not in all_isbns:
                log.debug(u'skipped, no requested ISBN %s found' % req_isbn)
                continue

            for isbn in all_isbns:
                self.cache_isbn_to_identifier(isbn, ozon_id)

            if mi.ozon_cover_url:
                self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url)

            self.clean_downloaded_metadata(mi)
            result_queue.put(mi)
        except:
            log.exception(u'Failed to get details for metadata: %s' % mi.title)

def test(mi):
    misbn = check_isbn(mi.isbn)
    if misbn and misbn == isbn_:
        return True
    prints('ISBN test failed. Expected: \'%s\' found \'%s\'' % (isbn_, misbn))
    return False

def create_query(self, log, title=None, authors=None, identifiers={}):
    isbn = check_isbn(identifiers.get('isbn', None))

    if isbn is not None:
        return '%stype=ISBN&arg=%s' % (ISFDB.SEARCH_URL, isbn)

    # Default to empty tokens so the checks below never hit undefined names
    # when only a title or only authors are supplied.
    title_tokens, author_tokens = [], []
    search_title = search_author = ''

    if title:
        title = title.replace('?', '')
        title_tokens = self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)
        title_tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in title_tokens]
        search_title = '+'.join(title_tokens)

    if authors:
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        author_tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in author_tokens]
        search_author = '+'.join(author_tokens)

    log.info("TITLE %s" % search_title)
    log.info("AUTHOR %s" % search_author)

    if not title_tokens and not author_tokens:
        return None
    elif title_tokens and author_tokens:
        # Currently the third term is unused; we could pass in a publisher.
        return '%sUSE_1=pub_title&OPERATOR_1=contains&TERM_1=%s&CONJUNCTION_1=AND' \
               '&USE_2=author_canonical&OPERATOR_2=contains&TERM_2=%s' \
               '&ORDERBY=pub_title&START=0&TYPE=Publication' % (ISFDB.ADV_SEARCH_URL, search_title, search_author)
    elif title_tokens:
        return '%sUSE_1=pub_title&OPERATOR_1=contains&TERM_1=%s' \
               '&ORDERBY=pub_title&START=0&TYPE=Publication' % (ISFDB.ADV_SEARCH_URL, search_title)
    elif author_tokens:
        return '%sUSE_1=author_canonical&OPERATOR_1=contains&TERM_1=%s' \
               '&ORDERBY=pub_title&START=0&TYPE=Publication' % (ISFDB.ADV_SEARCH_URL, search_author)

def get_book_url(self, identifiers):
    aladin_id = identifiers.get('aladin.co.kr', None)
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn:
        return ('aladin.co.kr', aladin_id,
                '%s/shop/wproduct.aspx?ISBN=%s' % (Aladin_co_kr.BASE_URL, isbn))
    elif aladin_id:
        return ('aladin.co.kr', aladin_id,
                '%s/shop/wproduct.aspx?ItemId=%s' % (Aladin_co_kr.BASE_URL, aladin_id))

def accept(self):
    isbn = unicode(self.line_edit.text())
    if not check_isbn(isbn):
        return error_dialog(self, _('Invalid ISBN'),
                            _('The ISBN you entered is not valid. Try again.'),
                            show=True)
    QDialog.accept(self)

def get_metadata(stream, extract_cover=True):
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except:
            pass  # Do not let an error reading the cover prevent reading other data

    return mi

def create_query(self, log, title=None, authors=None, identifiers={}):
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode
    import time
    BASE_URL = 'https://search.jd.com/Search?'
    keywords = []
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is not None:
        keywords.append(isbn)
    elif title:
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            keywords.extend(title_tokens)
        if self.prefs['add_authors']:
            author_tokens = self.get_author_tokens(authors, only_first_author=True)
            if author_tokens:
                keywords.extend(author_tokens)
    if not keywords:
        return None
    word = (' '.join(keywords)).encode('utf-8')
    params = {'keyword': word, 'enc': 'utf-8', 'wp': word, 'book': 'y'}
    return BASE_URL + urlencode(params)

def create_query(self, log, title=None, authors=None, identifiers={}):
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode
    import time
    BASE_URL = (
        'https://www.edelweiss.plus/GetTreelineControl.aspx?'
        'controlName=/uc/listviews/controls/ListView_data.ascx&itemID=0&resultType=32&dashboardType=8&itemType=1&dataType=products&keywordSearch&'
    )
    keywords = []
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is not None:
        keywords.append(isbn)
    elif title:
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            keywords.extend(title_tokens)
        author_tokens = self.get_author_tokens(authors, only_first_author=True)
        if author_tokens:
            keywords.extend(author_tokens)
    if not keywords:
        return None
    params = {
        'q': (' '.join(keywords)).encode('utf-8'),
        '_': type('')(int(time.time()))
    }
    return BASE_URL + urlencode(params)

def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    from html5_parser import parse
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.library.comments import sanitize_comments_html

    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]

    try:
        root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    except Exception:
        return False

    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except:
            pass
    if lang:
        lang = lang[0].strip().lower()
        lang = {'english': 'eng', 'french': 'fra', 'german': 'deu', 'spanish': 'spa'}.get(lang, None)
        if lang:
            mi.language = lang

    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]

    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding='unicode').strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)

    return None

def clean_downloaded_metadata(self, mi):
    docase = mi.language == "eng" or (mi.is_null("language") and self.domain in {"com", "uk"})
    if mi.title and docase:
        mi.title = fixcase(mi.title)
    mi.authors = fixauthors(mi.authors)
    if mi.tags and docase:
        mi.tags = list(map(fixcase, mi.tags))
    mi.isbn = check_isbn(mi.isbn)

def clean_downloaded_metadata(self, mi):
    docase = (mi.language == 'eng')
    if mi.title and docase:
        mi.title = fixcase(mi.title)
    mi.authors = fixauthors(mi.authors)
    if mi.tags and docase:
        mi.tags = list(map(fixcase, mi.tags))
    mi.isbn = check_isbn(mi.isbn)

def _format_isbn(log, isbn):  # {{{
    res = check_isbn(isbn)
    if res:
        m = isbn_pat.match(res)
        if m:
            res = '-'.join([g for g in m.groups() if g])
        else:
            log.error('cannot format isbn %s' % isbn)
    return res

def parse_isbn(self, pd):
    items = pd.xpath('descendant::*[starts-with(text(), "ISBN")]')
    if not items:
        items = pd.xpath('descendant::b[contains(text(), "ISBN:")]')
    for x in reversed(items):
        if x.tail:
            ans = check_isbn(x.tail.strip())
            if ans:
                return ans

def accept(self):
    isbn = unicode(self.line_edit.text())
    if not check_isbn(isbn):
        return error_dialog(
            self, _('Invalid ISBN'),
            _('The ISBN you entered is not valid. Try again.'),
            show=True)
    QDialog.accept(self)

def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode
    SEARCH_URL = 'https://api.douban.com/v2/book/search?count=10&'
    ISBN_URL = 'https://api.douban.com/v2/book/isbn/'
    SUBJECT_URL = 'https://api.douban.com/v2/book/'

    q = ''
    t = None
    isbn = check_isbn(identifiers.get('isbn', None))
    subject = identifiers.get('douban', None)
    if isbn is not None:
        q = isbn
        t = 'isbn'
    elif subject is not None:
        q = subject
        t = 'subject'
    elif title or authors:
        def build_term(prefix, parts):
            return ' '.join(x for x in parts)
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            q += build_term('title', title_tokens)
        author_tokens = list(self.get_author_tokens(authors, only_first_author=True))
        if author_tokens:
            q += ((' ' if q != '' else '') + build_term('author', author_tokens))
        t = 'search'
    q = q.strip()
    if isinstance(q, type(u'')):
        q = q.encode('utf-8')
    if not q:
        return None
    url = None
    if t == "isbn":
        url = ISBN_URL + q
    elif t == 'subject':
        url = SUBJECT_URL + q
    else:
        url = SEARCH_URL + urlencode({
            'q': q,
        })
    if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
        if t == "isbn" or t == "subject":
            url = url + "?apikey=" + self.DOUBAN_API_KEY
        else:
            url = url + "&apikey=" + self.DOUBAN_API_KEY
    return url

def create_queries(self, log, title=None, authors=None, identifiers={}):
    queries = []
    isbn = check_isbn(identifiers.get('isbn', None))
    if isbn is not None:
        queries.append('bath.isbn=%s' % (isbn))
    else:
        queries.append('dc.title=%s and dc.author=%s' % (title, authors))
    return queries

def get_cached_cover_url(self, identifiers):
    book_id = identifiers.get("databazeknih", None)
    if book_id is None:
        book_id = check_isbn(identifiers.get("isbn", None))
    url = self.cached_identifier_to_cover_url(book_id)
    return url

def _format_isbn(log, isbn):  # {{{
    res = check_isbn(isbn)
    if res:
        m = isbn_pat.match(res)
        if m:
            res = '-'.join([g for g in m.groups() if g])
        else:
            log.error('cannot format isbn %s' % isbn)
    return res

def clean_downloaded_metadata(self, mi):
    docase = (mi.language == 'eng' or
              (mi.is_null('language') and self.domain in {'com', 'uk'}))
    if mi.title and docase:
        mi.title = fixcase(mi.title)
    mi.authors = fixauthors(mi.authors)
    if mi.tags and docase:
        mi.tags = list(map(fixcase, mi.tags))
    mi.isbn = check_isbn(mi.isbn)

def _parse_isbn(root, mi, ctx):
    isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root)
    if isbn:
        # some people try to put several isbn in this field, but it is not allowed.
        # try to stick to the 1-st one in this case
        if ',' in isbn:
            isbn = isbn[:isbn.index(',')]
        if check_isbn(isbn):
            mi.isbn = isbn

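# Hedged, self-contained sketch of the comma handling in _parse_isbn above;
# the sample values are illustrative and calibre's check_isbn is not called here.
def _first_isbn_only(raw):
    # keep only the first ISBN when several are crammed into one field
    return raw[:raw.index(',')] if ',' in raw else raw

# _first_isbn_only('978-5-17-063250-3, 5-17-063250-X') -> '978-5-17-063250-3'
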
def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
        domain=None):
    from urllib import urlencode
    if domain is None:
        domain = self.domain

    idomain, asin = self.get_domain_and_asin(identifiers)
    if idomain is not None:
        domain = idomain

    # See the amazon detailed search page to get all options
    q = {'search-alias': 'aps',
         'unfiltered': '1',
         }

    if domain == 'com':
        q['sort'] = 'relevanceexprank'
    else:
        q['sort'] = 'relevancerank'

    isbn = check_isbn(identifiers.get('isbn', None))

    if asin is not None:
        q['field-keywords'] = asin
    elif isbn is not None:
        q['field-isbn'] = isbn
    else:
        # Only return book results
        q['search-alias'] = 'digital-text' if domain == 'br' else 'stripbooks'
        if title:
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q['field-title'] = ' '.join(title_tokens)
        if authors:
            author_tokens = self.get_author_tokens(authors, only_first_author=True)
            if author_tokens:
                q['field-author'] = ' '.join(author_tokens)

        if not ('field-keywords' in q or 'field-isbn' in q or ('field-title' in q)):
            # Insufficient metadata to make an identify query
            return None, None

    # magic parameter to enable Japanese Shift_JIS encoding.
    if domain == 'jp':
        q['__mk_ja_JP'] = u'カタカナ'

    if domain == 'jp':
        encode_to = 'Shift_JIS'
    else:
        encode_to = 'latin1'
    encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to, 'ignore'))
                      for x, y in q.iteritems()])
    url = 'http://www.amazon.%s/s/?' % self.get_website_domain(domain) + urlencode(encoded_q)
    return url, domain

def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode
    SEARCH_URL = "https://api.douban.com/v2/book/search?count=10&"
    ISBN_URL = "https://api.douban.com/v2/book/isbn/"
    SUBJECT_URL = "https://api.douban.com/v2/book/"

    q = ""
    t = None
    isbn = check_isbn(identifiers.get("isbn", None))
    subject = identifiers.get("douban", None)
    if isbn is not None:
        q = isbn
        t = "isbn"
    elif subject is not None:
        q = subject
        t = "subject"
    elif title or authors:
        def build_term(prefix, parts):
            return " ".join(x for x in parts)
        title_tokens = list(self.get_title_tokens(title))
        if title_tokens:
            q += build_term("title", title_tokens)
        author_tokens = list(
            self.get_author_tokens(authors, only_first_author=True)
        )
        if author_tokens:
            q += (" " if q != "" else "") + build_term("author", author_tokens)
        t = "search"
    q = q.strip()
    # if isinstance(q, type("")):
    #     q = q.encode("utf-8")
    q = str(q)
    if not q:
        return None
    url = None
    if t == "isbn":
        url = ISBN_URL + q
    elif t == "subject":
        url = SUBJECT_URL + q
    else:
        url = SEARCH_URL + urlencode(
            {
                "q": q,
            }
        )
    if self.prefs.get("apikey"):
        if t == "isbn" or t == "subject":
            url = url + "?apikey=" + self.prefs["apikey"]
        else:
            url = url + "&apikey=" + self.prefs["apikey"]
    return url

def clean_downloaded_metadata(self, mi):
    docase = (
        mi.language == 'eng' or mi.is_null('language')
    )
    if mi.title and docase:
        mi.title = fixcase(mi.title)
    mi.authors = fixauthors(mi.authors)
    if mi.tags and docase:
        mi.tags = list(map(fixcase, mi.tags))
    mi.isbn = check_isbn(mi.isbn)

def get_cached_cover_url(self, identifiers):
    url = None
    databazeknih_id = identifiers.get(u'databazeknih', None)
    if databazeknih_id is None:
        isbn = check_isbn(identifiers.get(u'isbn', None))
        if isbn is not None:
            databazeknih_id = self.cached_isbn_to_identifier(isbn)
    if databazeknih_id is not None:
        url = self.cached_identifier_to_cover_url(databazeknih_id)
    return url

def get_cached_cover_url(self, identifiers):
    url = None
    moly_id = identifiers.get('moly_hu', None)
    if moly_id is None:
        isbn = check_isbn(identifiers.get('isbn', None))
        if isbn is not None:
            moly_id = self.cached_isbn_to_identifier(isbn)
    if moly_id is not None:
        url = self.cached_identifier_to_cover_url(moly_id)
    return url

def clean_downloaded_metadata(self, mi):
    '''
    Call this method in your plugin's identify method to normalize metadata
    before putting the Metadata object into result_queue. You can of
    course, use a custom algorithm suited to your metadata source.
    '''
    if mi.title:
        mi.title = fixcase(mi.title)
    mi.authors = fixauthors(mi.authors)
    mi.tags = list(map(fixcase, mi.tags))
    mi.isbn = check_isbn(mi.isbn)

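# Hedged sketch of the usage described in the docstring above. The identify()
# signature follows the usual calibre Source-plugin convention, and
# parse_results is a hypothetical helper, not part of this method:
#
#   def identify(self, log, result_queue, abort, title=None, authors=None,
#                identifiers={}, timeout=30):
#       for mi in self.parse_results(...):      # hypothetical parsing step
#           self.clean_downloaded_metadata(mi)  # normalize case and ISBN
#           result_queue.put(mi)
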
def isbn_test(isbn):
    isbn_ = check_isbn(isbn)

    def test(mi):
        misbn = check_isbn(mi.isbn)
        if misbn and misbn == isbn_:
            return True
        prints('ISBN test failed. Expected: \'%s\' found \'%s\'' % (isbn_, misbn))
        return False

    return test

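# Hedged usage sketch for isbn_test above. The stand-in object below is a
# hypothetical stub with just an `isbn` attribute; real callers pass a calibre
# Metadata object. The sample ISBN is illustrative only.
class _FakeMi(object):
    isbn = '978-5-17-063250-3'

# isbn_test('9785170632503')(_FakeMi()) would be expected to return True,
# since both values should normalize to the same string via check_isbn().
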
def paste_isbn(self):
    text = unicode(QApplication.clipboard().text()).strip()
    if not text or not check_isbn(text):
        d = ISBNDialog(self, text)
        if not d.exec_():
            return
        text = d.text()
        if not text:
            return
    vals = self.current_val
    vals['isbn'] = text
    self.current_val = vals

def clean_downloaded_metadata(self, mi):
    """
    Call this method in your plugin's identify method to normalize metadata
    before putting the Metadata object into result_queue. You can of
    course, use a custom algorithm suited to your metadata source.
    """
    docase = mi.language == "eng" or mi.is_null("language")
    if docase and mi.title:
        mi.title = fixcase(mi.title)
    mi.authors = fixauthors(mi.authors)
    if mi.tags and docase:
        mi.tags = list(map(fixcase, mi.tags))
    mi.isbn = check_isbn(mi.isbn)

def checkText(self, txt):
    isbn = unicode(txt)
    if not isbn:
        col = 'none'
        extra = ''
    elif check_isbn(isbn) is not None:
        col = 'rgba(0,255,0,20%)'
        extra = _('This ISBN number is valid')
    else:
        col = 'rgba(255,0,0,20%)'
        extra = _('This ISBN number is invalid')
    self.line_edit.setToolTip(extra)
    self.line_edit.setStyleSheet('QLineEdit { background-color: %s }' % col)