Example #1
 def __init__(self, FIELD_MAP, field_metadata, db_prefs=None):
     self.FIELD_MAP = FIELD_MAP
     self.db_prefs = db_prefs
     self.composites = {}
     self.udc = get_udc()
     for key in field_metadata:
         if field_metadata[key]['datatype'] == 'composite':
             self.composites[field_metadata[key]['rec_index']] = key
     self.series_col = field_metadata['series']['rec_index']
     self.series_sort_col = field_metadata['series_sort']['rec_index']
     self._data = []
     self._map = self._map_filtered = []
     self.first_sort = True
     self.search_restriction = self.base_restriction = ''
     self.base_restriction_name = self.search_restriction_name = ''
     self.search_restriction_book_count = 0
     self.marked_ids_dict = {}
     self.field_metadata = field_metadata
     self.all_search_locations = field_metadata.get_search_terms()
     SearchQueryParser.__init__(self, self.all_search_locations, optimize=True)
     self.build_date_relop_dict()
     self.build_numeric_relop_dict()
     # Do this here so the var gets updated when a library changes
     global pref_use_primary_find_in_search
     pref_use_primary_find_in_search = prefs['use_primary_find_in_search']
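The composites loop above inverts the field metadata: every column whose datatype is 'composite' is mapped from its rec_index back to its lookup name. A minimal illustration with a made-up metadata dict (the field names here are hypothetical):

field_metadata = {
    '#shelf': {'datatype': 'composite', 'rec_index': 22},
    'title':  {'datatype': 'text',      'rec_index': 1},
}
composites = {meta['rec_index']: key
              for key, meta in field_metadata.items()
              if meta['datatype'] == 'composite'}
print(composites)  # -> {22: '#shelf'}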
Example #2
def ret_clean_text(log, dbg_lvl, text, swap=False, who=''):
    # For the noosfere search to work smoothly, authors and titles need to be cleaned:
    # strip insignificant characters and collapse redundant whitespace.
    #
    debug = dbg_lvl & 4
    if debug:
        log.info("\nIn ret_clean_txt(self, log, text, swap =", swap, ")")
        log.info("text         : ", text)

    # Calibre by default presents the author as "Firstname Lastname", cleaned to become "firstname lastname".
    # Noosfere presents the author as "LASTNAME Firstname"; let's get "Firstname LASTNAME", cleaned to "firstname lastname".
    #
    for k in [',', '.', '-', "'", '"', '(',
              ')']:  # yes I found a name with '(' and ')' in it...
        if k in text:
            text = text.replace(k, " ")
    text = " ".join(text.split())

    if swap:
        if debug:
            log.info("swap name and surname")
        nom = prenom = ""
        for i in range(len(text.split())):
            if (len(text.split()[i]) == 1) or (not text.split()[i].isupper()):
                prenom += " " + text.split()[i]
            else:
                nom += " " + text.split()[i]
        text = prenom + " " + nom
        if debug: log.info("text         : ", text)

    if debug:
        log.info("cleaned text : ", text)
        log.info("return text from ret_clean_txt")

    return lower(get_udc().decode(text))
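A stand-alone sketch of the swap step above, with no calibre dependencies (the real helper also runs the result through get_udc() and lower(), which this sketch skips):

def swap_name_order(text):
    # Mirror the loop above: tokens that are a single character (initials)
    # or not fully upper-case are treated as first names, the rest as surnames.
    prenom = nom = ""
    for tok in text.split():
        if len(tok) == 1 or not tok.isupper():
            prenom += " " + tok
        else:
            nom += " " + tok
    return " ".join((prenom + " " + nom).split())

print(swap_name_order("ASIMOV Isaac"))  # -> "Isaac ASIMOV"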
Example #3
def get_author_tokens(author, decode_non_ascii=True):
    '''
    Take an author and return a list of tokens useful for duplicate
    hash comparisons. This function tries to return tokens in
    first name middle names last name order, by assuming that if a comma is
    in the author name, the name is in lastname, other names form.
    '''

    ignore_suffixes = [
        'von', 'van', 'jr', 'sr', 'i', 'ii',
        'iii', 'second', 'third', 'md', 'phd'
    ]
    if author:
        # Leave ' in there for Irish names
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        au = replace_pat.sub(' ', author)
        if decode_non_ascii:
            au = get_udc().decode(au)
        parts = au.split()
        if ',' in au:
            # au probably in ln, fn form
            parts = parts[1:] + parts[:1]
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_suffixes:
                yield tok.lower()
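With re imported, the generator above can be exercised without calibre by passing decode_non_ascii=False (the author string is only an illustration):

print(list(get_author_tokens('Tolkien, J. R. R.', decode_non_ascii=False)))
# -> ['j', 'r', 'r', 'tolkien']   (the comma triggers the last-name-first rotation)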
Example #4
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except:
        if isinstance(orig, unicode):
            orig = orig.encode("ascii", "replace")
        ascii = orig.decode(preferred_encoding, "replace").encode("ascii", "replace")
    return ascii
Example #5
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except:
        if isinstance(orig, unicode):
            orig = orig.encode('ascii', 'replace')
        ascii = orig.decode(preferred_encoding,
                            'replace').encode('ascii', 'replace')
    return ascii
Example #6
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except:
        if isinstance(orig, unicode):
            orig = orig.encode('ascii', 'replace')
        ascii = orig.decode(preferred_encoding,
                'replace').encode('ascii', 'replace')
    return ascii
Example #7
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except Exception:
        if isinstance(orig, unicode_type):
            orig = orig.encode('ascii', 'replace')
        ascii = orig.decode(preferred_encoding, 'replace')
    if isinstance(ascii, bytes):
        ascii = ascii.decode('ascii', 'replace')
    return ascii
Example #8
def ascii_text(orig):
    udc = get_udc()
    try:
        ascii = udc.decode(orig)
    except Exception:
        if isinstance(orig, unicode_type):
            orig = orig.encode('ascii', 'replace')
        ascii = orig.decode(preferred_encoding, 'replace')
    if isinstance(ascii, bytes):
        ascii = ascii.decode('ascii', 'replace')
    return ascii
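All of the ascii_text variants above rely on calibre's get_udc() (a Unihandecode transliterator) and preferred_encoding. As a rough stand-in when calibre is not available, a standard-library-only sketch could look like this; it keeps the same contract (return a str) but only strips accents, it does not romanise non-Latin scripts the way Unihandecode does:

import unicodedata

def ascii_text_fallback(orig):
    # NFKD-decompose, then drop combining marks and anything else outside
    # ASCII. Weaker than get_udc(): CJK characters are dropped, not romanised.
    if isinstance(orig, bytes):
        orig = orig.decode('utf-8', 'replace')
    decomposed = unicodedata.normalize('NFKD', orig)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

print(ascii_text_fallback('Škvorecký'))  # -> 'Skvorecky'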
Example #9
 def create_title_query(self, log, title=None):
     q = ''
     if title:
         title = get_udc().decode(title)
         tokens = []
         title_tokens = list(self.get_title_tokens(title,
                             strip_joiners=False, strip_subtitle=True))
         tokens = [quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in title_tokens]
         q = '+'.join(tokens)
     if not q:
         return None
     return '%s/vyhledavani?q=%s'%(BookFan.BASE_URL, q)
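A self-contained sketch of the URL-building step in create_title_query, assuming quote is urllib.parse.quote and using a stand-in base URL (the real method takes its tokens from self.get_title_tokens):

from urllib.parse import quote

BASE_URL = 'https://example.org'  # stand-in for BookFan.BASE_URL

def build_title_query(title_tokens):
    # Percent-encode each token and join with '+', as the method above does.
    q = '+'.join(quote(t) for t in title_tokens)
    return '%s/vyhledavani?q=%s' % (BASE_URL, q) if q else None

print(build_title_query(['valka', 's', 'mloky']))
# -> https://example.org/vyhledavani?q=valka+s+mloky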
Example #10
def get_tag_tokens(tag, decode_non_ascii=True):
    '''
    Take a tag and return a list of tokens useful for duplicate
    hash comparisons.
    '''

    ignore_words = ['the', 'and', 'a']
    if tag:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        t = replace_pat.sub(' ', tag)
        if decode_non_ascii:
            t = get_udc().decode(t)
        parts = t.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
Example #11
def get_tag_tokens(tag, decode_non_ascii=True):
    '''
    Take a tag and return a list of tokens useful for duplicate
    hash comparisons.
    '''

    ignore_words = ['the', 'and', 'a']
    if tag:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        t = replace_pat.sub(' ', tag)
        if decode_non_ascii:
            t = get_udc().decode(t)
        parts = t.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
Example #12
def get_series_tokens(series, decode_non_ascii=True):
    '''
    Take a series and return a list of tokens useful for duplicate
    hash comparisons.
    '''

    ignore_words = ['the', 'a', 'and',]
    if series:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        s = replace_pat.sub(' ', series)
        if decode_non_ascii:
            s = get_udc().decode(s)
        parts = s.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
Example #13
def get_publisher_tokens(publisher, decode_non_ascii=True):
    '''
    Take a publisher and return a list of tokens useful for duplicate
    hash comparisons.
    '''

    ignore_words = ['the', 'inc', 'ltd', 'limited', 'llc', 'co', 'pty',
                    'usa', 'uk']
    if publisher:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        p = replace_pat.sub(' ', publisher)
        if decode_non_ascii:
            p = get_udc().decode(p)
        parts = p.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
Example #14
def get_publisher_tokens(publisher, decode_non_ascii=True):
    '''
    Take a publisher and return a list of tokens useful for duplicate
    hash comparisons.
    '''

    ignore_words = [
        'the', 'inc', 'ltd', 'limited', 'llc', 'co', 'pty', 'usa', 'uk'
    ]
    if publisher:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        p = replace_pat.sub(' ', publisher)
        if decode_non_ascii:
            p = get_udc().decode(p)
        parts = p.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
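The docstrings describe these generators as feeding "duplicate hash comparisons", but the combining step is not shown in these examples. Purely as an assumption, one plausible sketch is to sort the tokens and hash the joined string, so that word order and punctuation differences collapse to the same key:

import hashlib

def duplicate_hash(tokens):
    # Hypothetical combiner: order-insensitive; punctuation and noise words
    # have already been stripped by the token generators above.
    key = ' '.join(sorted(tokens))
    return hashlib.md5(key.encode('utf-8')).hexdigest()

h1 = duplicate_hash(get_publisher_tokens('The Example Co., Ltd.', decode_non_ascii=False))
h2 = duplicate_hash(get_publisher_tokens('Example Co Ltd', decode_non_ascii=False))
print(h1 == h2)  # -> True under this scheme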
Example #15
def get_title_tokens(title, strip_subtitle=True, decode_non_ascii=True):
    '''
    Take a title and return a list of tokens useful for an AND search query.
    Excludes subtitles (optionally), punctuation and a, the.
    '''
    if title:
        # strip sub-titles
        if strip_subtitle:
            subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
            if len(subtitle.sub('', title)) > 1:
                title = subtitle.sub('', title)

        title_patterns = [
            (re.compile(pat, re.IGNORECASE), repl) for pat, repl in [
                # Remove things like: (2010) (Omnibus) etc.
                (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|paperback|mass\s*market|edition|ed\.)[\])}]',
                 ''),
                # Remove any strings that contain the substring edition inside
                # parentheses
                (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''),
                # Remove commas used as separators in numbers
                (r'(\d+),(\d+)', r'\1\2'),
                # Remove hyphens only if they have whitespace before them
                (r'(\s-)', ' '),
                # Remove single quotes not followed by 's'
                (r"'(?!s)", ''),
                # Replace other special chars with a space
                (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
            ]
        ]

        for pat, repl in title_patterns:
            title = pat.sub(repl, title)

        if decode_non_ascii:
            title = get_udc().decode(title)
        tokens = title.split()
        for token in tokens:
            token = token.strip()
            if token and (token.lower() not in ('a', 'the')):
                yield token.lower()
Example #16
def get_title_tokens(title, strip_subtitle=True, decode_non_ascii=True):
    '''
    Take a title and return a list of tokens useful for an AND search query.
    Excludes subtitles (optionally), punctuation and a, the.
    '''
    if title:
        # strip sub-titles
        if strip_subtitle:
            subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
            if len(subtitle.sub('', title)) > 1:
                title = subtitle.sub('', title)

        title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
        [
            # Remove things like: (2010) (Omnibus) etc.
            (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|paperback|mass\s*market|edition|ed\.)[\])}]', ''),
            # Remove any strings that contain the substring edition inside
            # parentheses
            (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''),
            # Remove commas used as separators in numbers
            (r'(\d+),(\d+)', r'\1\2'),
            # Remove hyphens only if they have whitespace before them
            (r'(\s-)', ' '),
            # Remove single quotes not followed by 's'
            (r"'(?!s)", ''),
            # Replace other special chars with a space
            (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
        ]]

        for pat, repl in title_patterns:
            title = pat.sub(repl, title)

        if decode_non_ascii:
            title = get_udc().decode(title)
        tokens = title.split()
        for token in tokens:
            token = token.strip()
            if token and (token.lower() not in ('a', 'the')):
                yield token.lower()
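Using the same calibre-free calling pattern (decode_non_ascii=False), the title tokeniser above behaves like this on an illustrative title; the subtitle after the colon and the leading article are dropped:

print(list(get_title_tokens('The Colour of Magic: A Discworld Novel (1983)',
                            decode_non_ascii=False)))
# -> ['colour', 'of', 'magic']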
Example #17
def get_series_tokens(series, decode_non_ascii=True):
    '''
    Take a series and return a list of tokens useful for duplicate
    hash comparisons.
    '''

    ignore_words = [
        'the',
        'a',
        'and',
    ]
    if series:
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        s = replace_pat.sub(' ', series)
        if decode_non_ascii:
            s = get_udc().decode(s)
        parts = s.split()
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in ignore_words:
                yield tok.lower()
Example #18
def get_author_tokens(author, decode_non_ascii=True):
    '''
    Take an author and return a list of tokens useful for duplicate
    hash comparisons. This function tries to return tokens in
    first name middle names last name order, by assuming that if a comma is
    in the author name, the name is in lastname, other names form.
    '''

    if author:
        # Leave ' in there for Irish names
        remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
        replace_pat = re.compile(r'[-+.:;]')
        au = replace_pat.sub(' ', author)
        if decode_non_ascii:
            au = get_udc().decode(au)
        parts = au.split()
        if ',' in au:
            # au probably in ln, fn form
            parts = parts[1:] + parts[:1]
        for tok in parts:
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 0 and tok.lower() not in IGNORE_AUTHOR_WORDS_MAP:
                yield tok.lower()
Example #19
    def __call__(self, html, remove_special_chars=None,
            get_preprocess_html=False):
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\0', '')
        is_pdftohtml = self.is_pdftohtml(html)
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif is_pdftohtml:
            rules = self.PDFTOHTML
        else:
            rules = []

        start_rules = []

        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

        user_sr_rules = {}
        # Function for processing search and replace

        def do_search_replace(search_pattern, replace_txt):
            from calibre.ebooks.conversion.search_replace import compile_regular_expression
            try:
                search_re = compile_regular_expression(search_pattern)
                if not replace_txt:
                    replace_txt = ''
                rules.insert(0, (search_re, replace_txt))
                user_sr_rules[(search_re, replace_txt)] = search_pattern
            except Exception as e:
                self.log.error('Failed to parse %r regexp because %s' %
                        (search_pattern, as_unicode(e)))

        # search / replace using the sr?_search / sr?_replace options
        for i in range(1, 4):
            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
            search_pattern = getattr(self.extra_opts, search, '')
            replace_txt = getattr(self.extra_opts, replace, '')
            if search_pattern:
                do_search_replace(search_pattern, replace_txt)

        # multi-search / replace using the search_replace option
        search_replace = getattr(self.extra_opts, 'search_replace', None)
        if search_replace:
            search_replace = json.loads(search_replace)
            for search_pattern, replace_txt in reversed(search_replace):
                do_search_replace(search_pattern, replace_txt)

        end_rules = []
        # delete soft hyphens - moved here so it's executed after header/footer removal
        if is_pdftohtml:
            # unwrap/delete soft hyphens
            end_rules.append((re.compile(u'[­](</p>\s*<p>\s*)+\s*(?=[[a-z\d])'), lambda match: ''))
            # unwrap/delete soft hyphens with formatting
            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # print "The pdf line length returned is " + str(length)
                # unwrap em/en dashes
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Unwrap using punctuation
                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),  # noqa
                )

        for rule in self.PREPROCESS + start_rules:
            html = rule[0].sub(rule[1], html)

        if self.regex_wizard_callback is not None:
            self.regex_wizard_callback(self.current_href, html)

        if get_preprocess_html:
            return html

        def dump(raw, where):
            import os
            dp = getattr(self.extra_opts, 'debug_pipeline', None)
            if dp and os.path.exists(dp):
                odir = os.path.join(dp, 'input')
                if os.path.exists(odir):
                    odir = os.path.join(odir, where)
                    if not os.path.exists(odir):
                        os.makedirs(odir)
                    name, i = None, 0
                    while not name or os.path.exists(os.path.join(odir, name)):
                        i += 1
                        name = '%04d.html'%i
                    with open(os.path.join(odir, name), 'wb') as f:
                        f.write(raw.encode('utf-8'))

        # dump(html, 'pre-preprocess')

        for rule in rules + end_rules:
            try:
                html = rule[0].sub(rule[1], html)
            except Exception as e:
                if rule in user_sr_rules:
                    self.log.error(
                        'User supplied search & replace rule: %s -> %s '
                        'failed with error: %s, ignoring.'%(
                            user_sr_rules[rule], rule[1], e))
                else:
                    raise

        if is_pdftohtml and length > -1:
            # Dehyphenate
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)

        if is_pdftohtml:
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            pdf_markup = HeuristicProcessor(self.extra_opts, None)
            totalwords = 0
            if pdf_markup.get_word_count(html) > 7000:
                html = pdf_markup.markup_chapters(html, totalwords, True)

        # dump(html, 'post-preprocess')

        # Handle broken XHTML w/ SVG (ugh)
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.utils.localization import get_udc
            from calibre.utils.mreplace import MReplace
            unihandecoder = get_udc()
            mr = MReplace(data={u'«':u'&lt;'*3, u'»':u'&gt;'*3})
            html = mr.mreplace(html)
            html = unihandecoder.decode(html)

        if getattr(self.extra_opts, 'enable_heuristics', False):
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
            html = preprocessor(html)

        if is_pdftohtml:
            html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = smarten_punctuation(html, self.log)

        try:
            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        except AttributeError:
            unsupported_unicode_chars = u''
        if unsupported_unicode_chars:
            from calibre.utils.localization import get_udc
            unihandecoder = get_udc()
            for char in unsupported_unicode_chars:
                asciichar = unihandecoder.decode(char)
                html = html.replace(char, asciichar)

        return html
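The do_search_replace closure above compiles each user pattern and prepends it to rules. A stripped-down sketch of that mechanism, without calibre's compile_regular_expression wrapper and applying the rules in the order given rather than prepending them:

import re

def apply_user_rules(html, pairs, log_error=print):
    # pairs: iterable of (search_pattern, replace_txt) tuples.
    rules = []
    for search_pattern, replace_txt in pairs:
        try:
            rules.append((re.compile(search_pattern), replace_txt or ''))
        except re.error as e:
            log_error('Failed to parse %r regexp because %s' % (search_pattern, e))
    for pat, repl in rules:
        html = pat.sub(repl, html)
    return html

print(apply_user_rules('foo bar', [(r'bar', 'baz')]))  # -> 'foo baz'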
Example #20
def similar_title_match(title, lang=None):
    title = get_udc().decode(title)
    result = fuzzy_it(title)
    if lang:
        return lang + result
    return result
Example #21
	def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
        log.info("identify")
		'''
		Note this method will retry without identifiers automatically if no
		match is found with identifiers.
		'''
		matches = []
		# If we have an ISFDB id then we do not need to fire a "search".
		# Instead we will go straight to the URL for that book.
		isfdb_id = identifiers.get('isfdb', None)
		isbn = check_isbn(identifiers.get('isbn', None))
		br = self.browser
		if isfdb_id:
			matches.append('%s/cgi-bin/pl.cgi?%s' % (ISFDB.BASE_URL, isfdb_id))
		else:
			title = get_udc().decode(title)
			authors = authors or []
			authors = [get_udc().decode(a) for a in authors]
			query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
			if query is None:
				log.error('Insufficient metadata to construct query. Alas!')
				return
			isbn_match_failed = False
			try:
				log.info('Querying: %s' % query)
				response = br.open_novisit(query, timeout=timeout)
				raw = response.read().decode('cp1252', errors='replace').strip()
				
				if isbn:
					# Check whether we got redirected to a book page for ISBN searches.
					# If we did, will use the url.
					# If we didn't then treat it as no matches on ISFDB
					location = response.geturl()
					# If not an exact match on ISBN we can get a search results page back
					# XMS: This may be terribly different for ISFDB.
					# XMS: HOWEVER: 1563890933 returns multiple results!
					isbn_match_failed = location.find('/pl.cgi') < 0
					if raw.find('found 0 matches') == -1 and not isbn_match_failed:
						log.info('ISBN match location: %r' % location)
						matches.append(location)
			except Exception as e:
				if isbn and callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
					# We did a lookup by ISBN but did not find a match
					# We will fallback to doing a lookup by title author
					log.info('Failed to find match for ISBN: %s' % isbn)
				elif callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
					log.error('No matches for identify query')
					return as_unicode(e)
				else:
					err = 'Failed to make identify query'
					log.exception(err)
					return as_unicode(e)

			# For successful ISBN-based searches we have already done everything we need to.
			# So anything from this point below is for title/author based searches.
			if not isbn or isbn_match_failed:
				try:
					root = fromstring(clean_ascii_chars(raw))
				except:
					msg = 'Failed to parse ISFDB page for query'
					log.exception(msg)
					return msg
				# Now grab the matches from the search results, provided the
				# title and authors appear to be for the same book
				self._parse_search_results(log, title, authors, root, matches, timeout)

		if abort.is_set():
			return

		if not matches:
			if identifiers and title and authors:
				log.info('No matches found with identifiers, retrying using only title and authors')
				return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout)
			log.error('No matches found with query: %r' % query)
			return

		from calibre_plugins.isfdb.worker import Worker
		workers = [Worker(url, result_queue, br, log, i, self) for i, url in enumerate(matches)]

		for w in workers:
			w.start()
			# Don't send all requests at the same time
			time.sleep(0.1)

		while not abort.is_set():
			a_worker_is_alive = False
			for w in workers:
				w.join(0.2)
				if abort.is_set():
					break
				if w.is_alive():
					a_worker_is_alive = True
			if not a_worker_is_alive:
				break
		
		return None
Example #22
def similar_title_match(title, lang=None):
    title = get_udc().decode(title)
    result = fuzzy_it(title)
    if lang:
        return lang + result
    return result
def similar_title_match(title):
    title = get_udc().decode(title)
    return fuzzy_it(title)
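fuzzy_it is defined elsewhere in these plugins and is not shown here. Purely as an assumption, a stand-in that collapses a title to a lowercase alphanumeric key illustrates what similar_title_match returns and how the lang prefix keeps keys from different catalogues apart:

import re

def fuzzy_it_stub(title):
    # Hypothetical stand-in for fuzzy_it: lowercase and drop everything
    # that is not a letter or digit.
    return re.sub(r'[^a-z0-9]+', '', title.lower())

def similar_title_key(title, lang=None):
    result = fuzzy_it_stub(title)
    return lang + result if lang else result

print(similar_title_key('The Left Hand of Darkness', lang='eng'))
# -> 'engthelefthandofdarkness'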