def __init__(self, FIELD_MAP, field_metadata, db_prefs=None):
    self.FIELD_MAP = FIELD_MAP
    self.db_prefs = db_prefs
    self.composites = {}
    self.udc = get_udc()
    for key in field_metadata:
        if field_metadata[key]['datatype'] == 'composite':
            self.composites[field_metadata[key]['rec_index']] = key
    self.series_col = field_metadata['series']['rec_index']
    self.series_sort_col = field_metadata['series_sort']['rec_index']
    self._data = []
    self._map = self._map_filtered = []
    self.first_sort = True
    self.search_restriction = self.base_restriction = ''
    self.base_restriction_name = self.search_restriction_name = ''
    self.search_restriction_book_count = 0
    self.marked_ids_dict = {}
    self.field_metadata = field_metadata
    self.all_search_locations = field_metadata.get_search_terms()
    SearchQueryParser.__init__(self, self.all_search_locations, optimize=True)
    self.build_date_relop_dict()
    self.build_numeric_relop_dict()
    # Do this here so the var gets updated when a library changes
    global pref_use_primary_find_in_search
    pref_use_primary_find_in_search = prefs['use_primary_find_in_search']
def ret_clean_text(log, dbg_lvl, text, swap=False, who=''):
    # For the noosfere search to work smoothly, authors and title need to be cleaned:
    # remove non-significant characters and collapse useless space characters.
    #
    debug = dbg_lvl & 4
    if debug:
        log.info("\nIn ret_clean_txt(self, log, text, swap =", swap, ")")
        log.info("text : ", text)

    # Calibre by default presents the author as "Firstname Lastname", cleaned to become "firstname lastname".
    # Noosfere presents the author as "LASTNAME Firstname"; swap it to "Firstname LASTNAME",
    # then clean it to "firstname lastname".
    #
    for k in [',', '.', '-', "'", '"', '(', ')']:   # yes, I found a name with '(' and ')' in it...
        if k in text:
            text = text.replace(k, " ")
    text = " ".join(text.split())

    if swap:
        if debug:
            log.info("swap name and surname")
        nom = prenom = ""
        for i in range(len(text.split())):
            if (len(text.split()[i]) == 1) or (not text.split()[i].isupper()):
                prenom += " " + text.split()[i]
            else:
                nom += " " + text.split()[i]
        text = prenom + " " + nom
        if debug:
            log.info("text : ", text)

    if debug:
        log.info("cleaned text : ", text)
        log.info("return text from ret_clean_txt")

    return lower(get_udc().decode(text))
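# Hedged usage sketch for ret_clean_text() above (not part of the original plugin);
# it assumes a calibre log object and passes dbg_lvl=0 so the debug branches stay quiet.
# With swap=True a noosfere-style "LASTNAME Firstname" is reordered before being
# transliterated by get_udc() and lower-cased.
def _demo_ret_clean_text(log):
    # Expected, roughly: 'jules verne' (the swap step may leave extra inner spaces)
    return ret_clean_text(log, 0, "VERNE Jules", swap=True)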
def get_author_tokens(author, decode_non_ascii=True):
    '''
    Take an author and return a list of tokens useful for duplicate
    hash comparisons. This function tries to return tokens in
    first name middle names last name order, by assuming that if a comma is
    in the author name, the name is in lastname, other names form.
    '''
    ignore_suffixes = [
        'von', 'van', 'jr', 'sr', 'i', 'ii', 'iii', 'second', 'third', 'md', 'phd'
    ]
    if author:
        # Leave ' in there for Irish names
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        au = replace_pat.sub(' ', author)
        if decode_non_ascii:
            au = get_udc().decode(au)
        parts = au.split()
        if ',' in au:
            # au probably in ln, fn form
            parts = parts[1:] + parts[:1]
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_suffixes:
                yield tok.lower()
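# Hedged usage sketch (not from the original source): get_author_tokens() is a
# generator, so it is normally materialised with list(). For a "lastname, first names"
# input the comma triggers the reordering branch.
def _demo_author_tokens():
    # Expected, roughly: ['j', 'r', 'r', 'tolkien']
    return list(get_author_tokens('Tolkien, J. R. R.'))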
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except:
        if isinstance(orig, unicode):
            orig = orig.encode("ascii", "replace")
        ascii = orig.decode(preferred_encoding, "replace").encode("ascii", "replace")
    return ascii
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except:
        if isinstance(orig, unicode):
            orig = orig.encode('ascii', 'replace')
        ascii = orig.decode(preferred_encoding, 'replace').encode('ascii', 'replace')
    return ascii
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except:
        if isinstance(orig, unicode):
            # Re-encode orig itself (not the result) so the decode below operates on bytes.
            orig = orig.encode('ascii', 'replace')
        ascii = orig.decode(preferred_encoding, 'replace').encode('ascii', 'replace')
    return ascii
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except Exception:
        if isinstance(orig, unicode_type):
            orig = orig.encode('ascii', 'replace')
        ascii = orig.decode(preferred_encoding, 'replace')
    if isinstance(ascii, bytes):
        ascii = ascii.decode('ascii', 'replace')
    return ascii
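# Hedged usage sketch for the ascii_text() variants above (not part of calibre's
# source). get_udc() returns a transliterator, so non-ASCII input comes back as a
# plain-ASCII approximation.
def _demo_ascii_text():
    # Expected, roughly: 'Capek' and 'Deja vu'
    return ascii_text(u'Čapek'), ascii_text(u'Déjà vu')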
def create_title_query(self, log, title=None):
    q = ''
    if title:
        title = get_udc().decode(title)
        tokens = []
        title_tokens = list(self.get_title_tokens(title,
                            strip_joiners=False, strip_subtitle=True))
        tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t)
                  for t in title_tokens]
        q = '+'.join(tokens)
    if not q:
        return None
    return '%s/vyhledavani?q=%s' % (BookFan.BASE_URL, q)
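# Hedged sketch of the URL this method builds (not part of the plugin; BookFan.BASE_URL
# is defined elsewhere in it): title tokens are transliterated by get_udc(), URL-quoted
# and joined with '+', so a title such as 'Vojna a mír' should yield something like
#     <BookFan.BASE_URL>/vyhledavani?q=vojna+a+mir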
def get_tag_tokens(tag, decode_non_ascii=True):
    '''
    Take a tag and return a list of tokens useful for duplicate
    hash comparisons.
    '''
    ignore_words = ['the', 'and', 'a']
    if tag:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        t = replace_pat.sub(' ', tag)
        if decode_non_ascii:
            t = get_udc().decode(t)
        parts = t.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
def get_series_tokens(series, decode_non_ascii=True):
    '''
    Take a series and return a list of tokens useful for duplicate
    hash comparisons.
    '''
    ignore_words = ['the', 'a', 'and',]
    if series:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        s = replace_pat.sub(' ', series)
        if decode_non_ascii:
            s = get_udc().decode(s)
        parts = s.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
def get_publisher_tokens(publisher, decode_non_ascii=True):
    '''
    Take a publisher and return a list of tokens useful for duplicate
    hash comparisons.
    '''
    ignore_words = ['the', 'inc', 'ltd', 'limited', 'llc', 'co', 'pty', 'usa', 'uk']
    if publisher:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        p = replace_pat.sub(' ', publisher)
        if decode_non_ascii:
            p = get_udc().decode(p)
        parts = p.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
def get_publisher_tokens(publisher, decode_non_ascii=True):
    '''
    Take a publisher and return a list of tokens useful for duplicate
    hash comparisons.
    '''
    ignore_words = [
        'the', 'inc', 'ltd', 'limited', 'llc', 'co', 'pty', 'usa', 'uk'
    ]
    if publisher:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        p = replace_pat.sub(' ', publisher)
        if decode_non_ascii:
            p = get_udc().decode(p)
        parts = p.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
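# Hedged usage sketch shared by the tag/series/publisher token helpers above (not part
# of the original source): punctuation is stripped, text is transliterated via
# get_udc(), and the words in ignore_words are dropped.
def _demo_publisher_tokens():
    # Expected, roughly: ['penguin', 'random', 'house'] ('llc' is an ignore word)
    return list(get_publisher_tokens('Penguin Random House LLC'))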
def get_title_tokens(title, strip_subtitle=True, decode_non_ascii=True):
    '''
    Take a title and return a list of tokens useful for an AND search query.
    Excludes subtitles (optionally), punctuation and a, the.
    '''
    if title:
        # strip sub-titles
        if strip_subtitle:
            subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
            if len(subtitle.sub('', title)) > 1:
                title = subtitle.sub('', title)

        title_patterns = [
            (re.compile(pat, re.IGNORECASE), repl) for pat, repl in [
                # Remove things like: (2010) (Omnibus) etc.
                (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|paperback|mass\s*market|edition|ed\.)[\])}]', ''),
                # Remove any strings that contain the substring edition inside
                # parentheses
                (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''),
                # Remove commas used as separators in numbers
                (r'(\d+),(\d+)', r'\1\2'),
                # Remove hyphens only if they have whitespace before them
                (r'(\s-)', ' '),
                # Remove single quotes not followed by 's'
                (r"'(?!s)", ''),
                # Replace other special chars with a space
                (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
            ]
        ]

        for pat, repl in title_patterns:
            title = pat.sub(repl, title)

        if decode_non_ascii:
            title = get_udc().decode(title)

        tokens = title.split()
        for token in tokens:
            token = token.strip()
            if token and (token.lower() not in ('a', 'the')):
                yield token.lower()
def get_title_tokens(title, strip_subtitle=True, decode_non_ascii=True):
    '''
    Take a title and return a list of tokens useful for an AND search query.
    Excludes subtitles (optionally), punctuation and a, the.
    '''
    if title:
        # strip sub-titles
        if strip_subtitle:
            subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
            if len(subtitle.sub('', title)) > 1:
                title = subtitle.sub('', title)

        title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in [
            # Remove things like: (2010) (Omnibus) etc.
            (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|paperback|mass\s*market|edition|ed\.)[\])}]', ''),
            # Remove any strings that contain the substring edition inside
            # parentheses
            (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''),
            # Remove commas used as separators in numbers
            (r'(\d+),(\d+)', r'\1\2'),
            # Remove hyphens only if they have whitespace before them
            (r'(\s-)', ' '),
            # Remove single quotes not followed by 's'
            (r"'(?!s)", ''),
            # Replace other special chars with a space
            (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
        ]]

        for pat, repl in title_patterns:
            title = pat.sub(repl, title)

        if decode_non_ascii:
            title = get_udc().decode(title)

        tokens = title.split()
        for token in tokens:
            token = token.strip()
            if token and (token.lower() not in ('a', 'the')):
                yield token.lower()
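# Hedged usage sketch for the get_title_tokens() variants above (not part of the
# original source): parenthesised subtitles and edition markers are stripped,
# punctuation is replaced, and the articles 'a'/'the' are dropped.
def _demo_title_tokens():
    # Expected, roughly: ['hobbit']
    return list(get_title_tokens('The Hobbit (75th Anniversary Edition)'))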
def get_series_tokens(series, decode_non_ascii=True):
    '''
    Take a series and return a list of tokens useful for duplicate
    hash comparisons.
    '''
    ignore_words = [
        'the', 'a', 'and',
    ]
    if series:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        s = replace_pat.sub(' ', series)
        if decode_non_ascii:
            s = get_udc().decode(s)
        parts = s.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
def get_author_tokens(author, decode_non_ascii=True):
    '''
    Take an author and return a list of tokens useful for duplicate
    hash comparisons. This function tries to return tokens in
    first name middle names last name order, by assuming that if a comma is
    in the author name, the name is in lastname, other names form.
    '''
    if author:
        # Leave ' in there for Irish names
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        au = replace_pat.sub(' ', author)
        if decode_non_ascii:
            au = get_udc().decode(au)
        parts = au.split()
        if ',' in au:
            # au probably in ln, fn form
            parts = parts[1:] + parts[:1]
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in IGNORE_AUTHOR_WORDS_MAP:
                yield tok.lower()
def __call__(self, html, remove_special_chars=None, get_preprocess_html=False):
    if remove_special_chars is not None:
        html = remove_special_chars.sub('', html)
    html = html.replace('\0', '')
    is_pdftohtml = self.is_pdftohtml(html)
    if self.is_baen(html):
        rules = []
    elif self.is_book_designer(html):
        rules = self.BOOK_DESIGNER
    elif is_pdftohtml:
        rules = self.PDFTOHTML
    else:
        rules = []

    start_rules = []

    if not getattr(self.extra_opts, 'keep_ligatures', False):
        html = _ligpat.sub(lambda m: LIGATURES[m.group()], html)

    user_sr_rules = {}

    # Function for processing search and replace
    def do_search_replace(search_pattern, replace_txt):
        from calibre.ebooks.conversion.search_replace import compile_regular_expression
        try:
            search_re = compile_regular_expression(search_pattern)
            if not replace_txt:
                replace_txt = ''
            rules.insert(0, (search_re, replace_txt))
            user_sr_rules[(search_re, replace_txt)] = search_pattern
        except Exception as e:
            self.log.error('Failed to parse %r regexp because %s' %
                           (search_pattern, as_unicode(e)))

    # search / replace using the sr?_search / sr?_replace options
    for i in range(1, 4):
        search, replace = 'sr%d_search' % i, 'sr%d_replace' % i
        search_pattern = getattr(self.extra_opts, search, '')
        replace_txt = getattr(self.extra_opts, replace, '')
        if search_pattern:
            do_search_replace(search_pattern, replace_txt)

    # multi-search / replace using the search_replace option
    search_replace = getattr(self.extra_opts, 'search_replace', None)
    if search_replace:
        search_replace = json.loads(search_replace)
        for search_pattern, replace_txt in reversed(search_replace):
            do_search_replace(search_pattern, replace_txt)

    end_rules = []
    # delete soft hyphens - moved here so it's executed after header/footer removal
    if is_pdftohtml:
        # unwrap/delete soft hyphens (\xad is the soft hyphen character)
        end_rules.append((re.compile(u'[\xad](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
        # unwrap/delete soft hyphens with formatting
        end_rules.append((re.compile(u'[\xad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

    length = -1
    if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
        docanalysis = DocAnalysis('pdf', html)
        length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
        if length:
            # print "The pdf line length returned is " + str(length)
            # unwrap em/en dashes
            end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
            end_rules.append(
                # Un wrap using punctuation
                (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
            )

    for rule in self.PREPROCESS + start_rules:
        html = rule[0].sub(rule[1], html)

    if self.regex_wizard_callback is not None:
        self.regex_wizard_callback(self.current_href, html)

    if get_preprocess_html:
        return html

    def dump(raw, where):
        import os
        dp = getattr(self.extra_opts, 'debug_pipeline', None)
        if dp and os.path.exists(dp):
            odir = os.path.join(dp, 'input')
            if os.path.exists(odir):
                odir = os.path.join(odir, where)
                if not os.path.exists(odir):
                    os.makedirs(odir)
                name, i = None, 0
                while not name or os.path.exists(os.path.join(odir, name)):
                    i += 1
                    name = '%04d.html' % i
                with open(os.path.join(odir, name), 'wb') as f:
                    f.write(raw.encode('utf-8'))

    # dump(html, 'pre-preprocess')

    for rule in rules + end_rules:
        try:
            html = rule[0].sub(rule[1], html)
        except Exception as e:
            if rule in user_sr_rules:
                self.log.error(
                    'User supplied search & replace rule: %s -> %s '
                    'failed with error: %s, ignoring.' % (
                        user_sr_rules[rule], rule[1], e))
            else:
                raise

    if is_pdftohtml and length > -1:
        # Dehyphenate
        dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
        html = dehyphenator(html, 'html', length)

    if is_pdftohtml:
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        pdf_markup = HeuristicProcessor(self.extra_opts, None)
        totalwords = 0
        if pdf_markup.get_word_count(html) > 7000:
            html = pdf_markup.markup_chapters(html, totalwords, True)

    # dump(html, 'post-preprocess')

    # Handle broken XHTML w/ SVG (ugh)
    if 'svg:' in html and SVG_NS not in html:
        html = html.replace('<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
    if 'xlink:' in html and XLINK_NS not in html:
        html = html.replace('<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

    html = XMLDECL_RE.sub('', html)

    if getattr(self.extra_opts, 'asciiize', False):
        from calibre.utils.localization import get_udc
        from calibre.utils.mreplace import MReplace
        unihandecoder = get_udc()
        mr = MReplace(data={u'«': u'<' * 3, u'»': u'>' * 3})
        html = mr.mreplace(html)
        html = unihandecoder.decode(html)

    if getattr(self.extra_opts, 'enable_heuristics', False):
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        preprocessor = HeuristicProcessor(self.extra_opts, self.log)
        html = preprocessor(html)

    if is_pdftohtml:
        html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

    if getattr(self.extra_opts, 'smarten_punctuation', False):
        html = smarten_punctuation(html, self.log)

    try:
        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
    except AttributeError:
        unsupported_unicode_chars = u''
    if unsupported_unicode_chars:
        from calibre.utils.localization import get_udc
        unihandecoder = get_udc()
        for char in unsupported_unicode_chars:
            asciichar = unihandecoder.decode(char)
            html = html.replace(char, asciichar)

    return html
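# Hedged sketch of the 'asciiize' step from __call__ above in isolation, assuming a
# calibre environment (get_udc and MReplace are the same utilities the method imports):
# guillemets are first mapped to ASCII angle brackets, then the whole string is
# transliterated to ASCII.
def _demo_asciiize(html):
    from calibre.utils.localization import get_udc
    from calibre.utils.mreplace import MReplace
    mr = MReplace(data={u'«': u'<' * 3, u'»': u'>' * 3})
    return get_udc().decode(mr.mreplace(html))

# _demo_asciiize(u'«Déjà vu»') should return roughly u'<<<Deja vu>>>'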
def similar_title_match(title, lang=None):
    title = get_udc().decode(title)
    result = fuzzy_it(title)
    if lang:
        return lang + result
    return result
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    log.info("identify")
    '''
    Note this method will retry without identifiers automatically if no
    match is found with identifiers.
    '''
    matches = []
    # If we have an ISFDB id then we do not need to fire a "search".
    # Instead we will go straight to the URL for that book.
    isfdb_id = identifiers.get('isfdb', None)
    isbn = check_isbn(identifiers.get('isbn', None))
    br = self.browser
    if isfdb_id:
        matches.append('%s/cgi-bin/pl.cgi?%s' % (ISFDB.BASE_URL, isfdb_id))
    else:
        title = get_udc().decode(title)
        authors = authors or []
        authors = [get_udc().decode(a) for a in authors]
        query = self.create_query(log, title=title, authors=authors,
                                  identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query. Alas!')
            return
        isbn_match_failed = False
        try:
            log.info('Querying: %s' % query)
            response = br.open_novisit(query, timeout=timeout)
            raw = response.read().decode('cp1252', errors='replace').strip()
            if isbn:
                # Check whether we got redirected to a book page for ISBN searches.
                # If we did, we will use that url.
                # If we didn't, treat it as no matches on ISFDB.
                location = response.geturl()
                # If not an exact match on ISBN we can get a search results page back.
                # XMS: This may be terribly different for ISFDB.
                # XMS: HOWEVER: 1563890933 returns multiple results!
                isbn_match_failed = location.find('/pl.cgi') < 0
                if raw.find('found 0 matches') == -1 and not isbn_match_failed:
                    log.info('ISBN match location: %r' % location)
                    matches.append(location)
        except Exception as e:
            if isbn and callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                # We did a lookup by ISBN but did not find a match.
                # We will fall back to doing a lookup by title and author.
                log.info('Failed to find match for ISBN: %s' % isbn)
            elif callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                log.error('No matches for identify query')
                return as_unicode(e)
            else:
                err = 'Failed to make identify query'
                log.exception(err)
                return as_unicode(e)

        # For successful ISBN-based searches we have already done everything we need to,
        # so anything from this point below is for title/author based searches.
        if not isbn or isbn_match_failed:
            try:
                root = fromstring(clean_ascii_chars(raw))
            except:
                msg = 'Failed to parse ISFDB page for query'
                log.exception(msg)
                return msg
            # Now grab the matches from the search results, provided the
            # title and authors appear to be for the same book
            self._parse_search_results(log, title, authors, root, matches, timeout)

    if abort.is_set():
        return

    if not matches:
        if identifiers and title and authors:
            log.info('No matches found with identifiers, retrying using only title and authors')
            return self.identify(log, result_queue, abort, title=title,
                                 authors=authors, timeout=timeout)
        log.error('No matches found with query: %r' % query)
        return

    from calibre_plugins.isfdb.worker import Worker
    workers = [Worker(url, result_queue, br, log, i, self)
               for i, url in enumerate(matches)]

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break

    return None
def similar_title_match(title):
    title = get_udc().decode(title)
    return fuzzy_it(title)