def add_phrase(self, input_phrase='', phrase='', p_phrase='',
               pp_phrase='', user_freq=0, commit=True):
    '''
    Add phrase to database
    '''
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'input_phrase=%s phrase=%s user_freq=%s ',
            input_phrase.encode('UTF-8'),
            phrase.encode('UTF-8'),
            user_freq)
    if not input_phrase or not phrase:
        return
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    select_sqlstr = '''
    SELECT * FROM user_db.phrases
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    ;'''
    select_sqlargs = {
        'input_phrase': input_phrase,
        'phrase': phrase,
        'p_phrase': p_phrase,
        'pp_phrase': pp_phrase}
    if self.database.execute(select_sqlstr, select_sqlargs).fetchall():
        # there is already such a phrase, i.e. add_phrase was called
        # in error, do nothing to avoid duplicate entries.
        return
    insert_sqlstr = '''
    INSERT INTO user_db.phrases
    (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
    VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase,
            :user_freq, :timestamp)
    ;'''
    insert_sqlargs = {'input_phrase': input_phrase,
                      'phrase': phrase,
                      'p_phrase': p_phrase,
                      'pp_phrase': pp_phrase,
                      'user_freq': user_freq,
                      'timestamp': time.time()}
    if DEBUG_LEVEL > 1:
        LOGGER.debug('insert_sqlstr=%s', insert_sqlstr)
        LOGGER.debug('insert_sqlargs=%s', insert_sqlargs)
    try:
        self.database.execute(insert_sqlstr, insert_sqlargs)
        if commit:
            self.database.commit()
    except Exception:
        LOGGER.exception('Unexpected error adding phrase to database.')
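# The ':name' placeholders above use the named-parameter substitution of
# Python's sqlite3 module, which keeps user input from being interpreted
# as SQL. A minimal standalone illustration (the in-memory table is
# invented for the example):
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE phrases (phrase TEXT, user_freq INTEGER)')
conn.execute('INSERT INTO phrases VALUES (:phrase, :user_freq)',
             {'phrase': 'colour', 'user_freq': 1})
print(conn.execute('SELECT * FROM phrases').fetchall())  # [('colour', 1)]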
def test_remove_accents(self) -> None:
    self.assertEqual(
        itb_util.remove_accents('abcÅøßẞüxyz'),
        'abcAossSSuxyz')
    self.assertEqual(
        itb_util.remove_accents(
            unicodedata.normalize('NFD', 'abcÅøßẞüxyz')),
        'abcAossSSuxyz')
    self.assertEqual(
        unicodedata.normalize(
            'NFC',
            itb_util.remove_accents('abcÅøßẞüxyz', keep='åÅØø')),
        'abcÅøssSSuxyz')
    self.assertEqual(
        unicodedata.normalize(
            'NFC',
            itb_util.remove_accents(
                unicodedata.normalize('NFD', 'abcÅøßẞüxyz'),
                keep=unicodedata.normalize('NFD', 'åÅØø'))),
        'abcÅøssSSuxyz')
    self.assertEqual(
        unicodedata.normalize(
            'NFC',
            itb_util.remove_accents('alkoholförgiftning', keep='åÅÖö')),
        'alkoholförgiftning')
    self.assertEqual(
        unicodedata.normalize(
            'NFC',
            itb_util.remove_accents(
                unicodedata.normalize('NFD', 'alkoholförgiftning'),
                keep=unicodedata.normalize('NFD', 'åÅÖö'))),
        'alkoholförgiftning')
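# The tests above pin down the behaviour of itb_util.remove_accents():
# combining marks are stripped, letters without a canonical Unicode
# decomposition (ø, ß, ẞ) are transliterated, and characters listed in
# the optional `keep` argument survive untouched. A minimal sketch of
# such a helper follows; the exact transliteration table of the real
# itb_util.remove_accents() is an assumption here:
import unicodedata

_TRANS_TABLE = str.maketrans({
    'ß': 'ss', 'ẞ': 'SS',  # no canonical decomposition, transliterate
    'ø': 'o', 'Ø': 'O',    # the stroke is not a combining mark
})

def remove_accents_sketch(text: str, keep: str = '') -> str:
    '''Remove accents, except on the characters listed in keep.'''
    keep = unicodedata.normalize('NFC', keep)
    result = ''
    for char in unicodedata.normalize('NFC', text):
        if char in keep:
            result += char
            continue
        # Decompose, drop combining marks (category 'Mn'), then
        # transliterate the remaining special letters:
        result += ''.join(
            c for c in unicodedata.normalize('NFD', char)
            if unicodedata.category(c) != 'Mn').translate(_TRANS_TABLE)
    return result

# remove_accents_sketch('abcÅøßẞüxyz') == 'abcAossSSuxyz'
# remove_accents_sketch('abcÅøßẞüxyz', keep='åÅØø') == 'abcÅøssSSuxyz'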
def update_phrase(self, input_phrase='', phrase='', p_phrase='',
                  pp_phrase='', user_freq=0, commit=True):
    '''
    update the user frequency of a phrase
    '''
    if not input_phrase or not phrase:
        return
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        self._normalization_form_internal, input_phrase)
    phrase = unicodedata.normalize(
        self._normalization_form_internal, phrase)
    p_phrase = unicodedata.normalize(
        self._normalization_form_internal, p_phrase)
    pp_phrase = unicodedata.normalize(
        self._normalization_form_internal, pp_phrase)
    sqlstr = '''
    UPDATE user_db.phrases
    SET user_freq = :user_freq, timestamp = :timestamp
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    ;'''
    sqlargs = {
        'user_freq': user_freq,
        'input_phrase': input_phrase,
        'phrase': phrase,
        'p_phrase': p_phrase,
        'pp_phrase': pp_phrase,
        'timestamp': time.time()
    }
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "tabsqlitedb.update_phrase() sqlstr=%s\n" % sqlstr)
        sys.stderr.write(
            "tabsqlitedb.update_phrase() sqlargs=%s\n" % sqlargs)
    try:
        self.db.execute(sqlstr, sqlargs)
        if commit:
            self.db.commit()
    except Exception:
        traceback.print_exc()
def update_phrase(self, input_phrase='', phrase='', p_phrase='',
                  pp_phrase='', user_freq=0, commit=True):
    '''
    update the user frequency of a phrase
    '''
    if not input_phrase or not phrase:
        return
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    sqlstr = '''
    UPDATE user_db.phrases
    SET user_freq = :user_freq, timestamp = :timestamp
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    ;'''
    sqlargs = {'user_freq': user_freq,
               'input_phrase': input_phrase,
               'phrase': phrase,
               'p_phrase': p_phrase,
               'pp_phrase': pp_phrase,
               'timestamp': time.time()}
    if DEBUG_LEVEL > 1:
        LOGGER.debug('sqlstr=%s', sqlstr)
        LOGGER.debug('sqlargs=%s', sqlargs)
    try:
        self.database.execute(sqlstr, sqlargs)
        if commit:
            self.database.commit()
    except Exception:
        LOGGER.exception('Unexpected error updating phrase in user_db.')
def load_dictionary(self):
    '''Load a hunspell dictionary and instantiate an
    enchant.Dict() or a hunspell.Hunspell() object.
    '''
    if DEBUG_LEVEL > 0:
        sys.stderr.write("load_dictionary() ...\n")
    (self.dic_path,
     self.encoding,
     self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
    if self.words:
        # List of languages where accent insensitive matching makes sense:
        accent_languages = (
            'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
            'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo',
            'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
            'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
            'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
            'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
            'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
            'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
            've', 'vi', 'wa', 'xh',
        )
        if self.name.split('_')[0] in accent_languages:
            self.word_pairs = [
                (x, itb_util.remove_accents(x))
                for x in self.words
            ]
        for word in self.words:
            if len(word) > self.max_word_len:
                self.max_word_len = len(word)
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                'load_dictionary() max_word_len = %s\n'
                % self.max_word_len)
        if IMPORT_ENCHANT_SUCCESSFUL:
            self.enchant_dict = enchant.Dict(self.name)
        elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
            aff_path = self.dic_path.replace('.dic', '.aff')
            self.pyhunspell_object = hunspell.HunSpell(
                self.dic_path, aff_path)
def update_phrase(self, input_phrase='', phrase='', p_phrase='',
                  pp_phrase='', user_freq=0, commit=True):
    '''
    update the user frequency of a phrase
    '''
    if not input_phrase or not phrase:
        return
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    sqlstr = '''
    UPDATE user_db.phrases
    SET user_freq = :user_freq, timestamp = :timestamp
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    ;'''
    sqlargs = {'user_freq': user_freq,
               'input_phrase': input_phrase,
               'phrase': phrase,
               'p_phrase': p_phrase,
               'pp_phrase': pp_phrase,
               'timestamp': time.time()}
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.update_phrase() sqlstr=%s\n" % sqlstr)
        sys.stderr.write(
            "TabSqliteDb.update_phrase() sqlargs=%s\n" % sqlargs)
    try:
        self.db.execute(sqlstr, sqlargs)
        if commit:
            self.db.commit()
    except Exception:
        traceback.print_exc()
def load_dictionary(self):
    '''Load a hunspell dictionary and instantiate an
    enchant.Dict() or a hunspell.Hunspell() object.
    '''
    if DEBUG_LEVEL > 0:
        sys.stderr.write("load_dictionary() ...\n")
    (self.dic_path,
     self.encoding,
     self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
    if self.words:
        # List of languages where accent insensitive matching makes sense:
        accent_languages = (
            'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
            'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo',
            'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
            'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
            'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
            'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
            'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
            'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
            've', 'vi', 'wa', 'xh',
        )
        if self.name.split('_')[0] in accent_languages:
            self.word_pairs = [
                (x, itb_util.remove_accents(x))
                for x in self.words
            ]
        for x in self.words:
            if len(x) > self.max_word_len:
                self.max_word_len = len(x)
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                'load_dictionary() max_word_len = %s\n'
                % self.max_word_len)
        if IMPORT_ENCHANT_SUCCESSFUL:
            self.enchant_dict = enchant.Dict(self.name)
        elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
            aff_path = self.dic_path.replace('.dic', '.aff')
            self.pyhunspell_object = hunspell.HunSpell(self.dic_path,
                                                       aff_path)
def check_phrase_and_update_frequency(
        self, input_phrase='', phrase='', p_phrase='',
        pp_phrase='', user_freq_increment=1, commit=True):
    '''
    Check whether input_phrase and phrase are already in database. If
    they are in the database, increase the frequency by 1, if not add
    them.
    '''
    if not input_phrase:
        input_phrase = phrase
    if not phrase:
        return
    phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'phrase=%(p)s, input_phrase=%(t)s',
            {'p': phrase.encode('UTF-8'),
             't': input_phrase.encode('UTF-8')})
    # There should never be more than 1 database row for the same
    # input_phrase *and* phrase. So the following query on
    # the database should match at most one database
    # row and the length of the result array should be 0 or
    # 1. So the “GROUP BY phrase” is actually redundant. It is
    # only a safeguard for the case when duplicate rows have been
    # added to the database accidentally (But in that case there
    # is a bug somewhere else which should be fixed).
    sqlstr = '''
    SELECT max(user_freq) FROM user_db.phrases
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    GROUP BY phrase
    ;'''
    sqlargs = {'input_phrase': input_phrase,
               'phrase': phrase,
               'p_phrase': p_phrase,
               'pp_phrase': pp_phrase}
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'TabSqliteDb.check_phrase_and_update_frequency() sqlstr=%s',
            sqlstr)
        LOGGER.debug(
            'TabSqliteDb.check_phrase_and_update_frequency() sqlargs=%s',
            sqlargs)
    result = self.database.execute(sqlstr, sqlargs).fetchall()
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'check_phrase_and_update_frequency() result=%s', result)
    if result:
        # A match was found in user_db, increase user frequency by
        # user_freq_increment (1 by default)
        self.update_phrase(input_phrase=input_phrase,
                           phrase=phrase,
                           p_phrase=p_phrase,
                           pp_phrase=pp_phrase,
                           user_freq=result[0][0]+user_freq_increment,
                           commit=commit)
        return
    # The phrase was not found in user_db.
    # Add it as a new phrase, i.e. with user_freq = user_freq_increment
    # (1 by default):
    self.add_phrase(input_phrase=input_phrase,
                    phrase=phrase,
                    p_phrase=p_phrase,
                    pp_phrase=pp_phrase,
                    user_freq=user_freq_increment,
                    commit=commit)
    return
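# A hedged usage sketch of the method above: a typing-booster engine
# might feed each committed word plus its two predecessors in, like this
# (the variable name `database` and the example words are assumptions,
# not from the original code):
#
#     database.check_phrase_and_update_frequency(
#         input_phrase='Gluhwurmchen',  # what the user actually typed
#         phrase='Glühwürmchen',        # the candidate that was committed
#         p_phrase='das',               # the word committed just before
#         pp_phrase='und')              # the word committed before that
#
# The first such commit inserts a row with user_freq=1; each repeat of
# the same (input_phrase, phrase, context) combination raises user_freq
# by user_freq_increment.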
def select_words(self, input_phrase, p_phrase='', pp_phrase=''):
    '''
    Get phrases from database completing input_phrase.

    Returns a list of matches where each match is a tuple in the
    form of (phrase, user_freq), i.e. returns something like
    [(phrase, user_freq), ...]
    '''
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'input_phrase=%s p_phrase=%s pp_phrase=%s',
            input_phrase.encode('UTF-8'),
            p_phrase.encode('UTF-8'),
            pp_phrase.encode('UTF-8'))
    phrase_frequencies = {}
    if ' ' not in input_phrase:
        # Get suggestions from hunspell dictionaries. But only
        # if input_phrase does not contain spaces. The hunspell
        # dictionaries contain only single words, not sentences.
        # Trying to complete an input_phrase which contains spaces
        # will never work and spell checking suggestions by hunspell
        # for input which contains spaces is almost always nonsense.
        phrase_frequencies.update([
            x for x in self.hunspell_obj.suggest(input_phrase)])
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'hunspell: best_candidates=%s',
            self.best_candidates(phrase_frequencies))
    # Remove the accents *after* getting the hunspell candidates.
    # If the accents were removed before getting the hunspell candidates
    # an input phrase like “Glühwürmchen” would not be added as a
    # candidate because hunspell would get “Gluhwurmchen” then and would
    # not validate that as a correct word. And, because “Glühwürmchen”
    # is not in the German hunspell dictionary as a single word but
    # created by suffix and prefix rules, the accent insensitive match
    # in the German hunspell dictionary would not find it either.
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    # Now phrase_frequencies might contain something like this:
    #
    # {'code': 0, 'communicability': 0, 'cold': 0, 'colour': 0}

    # To quote a string to be used as a parameter when assembling
    # an sqlite statement with Python string operations, remove
    # all NUL characters, replace " with "" and wrap the whole
    # string in double quotes. Assembling sqlite statements using
    # parameters containing user input with python string operations
    # is not recommended because of the risk of SQL injection attacks
    # if the quoting is not done the right way. So it is better to use
    # the parameter substitution of the sqlite3 python interface.
    # But unfortunately that does not work when creating views,
    # (“OperationalError: parameters are not allowed in views”).
    quoted_input_phrase = input_phrase.replace(
        '\x00', '').replace('"', '""')
    self.database.execute('DROP VIEW IF EXISTS like_input_phrase_view;')
    sqlstr = '''
    CREATE TEMPORARY VIEW IF NOT EXISTS like_input_phrase_view AS
    SELECT * FROM user_db.phrases
    WHERE input_phrase LIKE "%(quoted_input_phrase)s%%"
    ;''' % {'quoted_input_phrase': quoted_input_phrase}
    self.database.execute(sqlstr)
    sqlargs = {'p_phrase': p_phrase, 'pp_phrase': pp_phrase}
    sqlstr = (
        'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
        + 'GROUP BY phrase;')
    try:
        # Get “unigram” data from user_db.
        #
        # Example: Let’s assume the user typed “co” and user_db contains
        #
        # 1|colou|colour|green|nice|1
        # 2|col|colour|yellow|ugly|2
        # 3|co|colour|green|awesome|1
        # 4|co|cold|||1
        # 5|conspirac|conspiracy|||5
        # 6|conspi|conspiracy|||1
        # 7|c|conspiracy|||1
        results_uni = self.database.execute(sqlstr, sqlargs).fetchall()
        # Then the result returned by .fetchall() is:
        #
        # [('colour', 4), ('cold', 1), ('conspiracy', 6)]
        #
        # (“c|conspiracy|1” is not selected because it doesn’t
        # match the user input “LIKE co%”! I.e. this is filtered
        # out by the VIEW created above already)
    except Exception:
        LOGGER.exception(
            'Unexpected error getting “unigram” data from user_db.')
    if not results_uni:
        # If no unigrams matched, bigrams and trigrams cannot
        # match either. We can stop here and return what we got
        # from hunspell.
        return self.best_candidates(phrase_frequencies)
    # Now normalize the unigram frequencies with the total count
    # (which is 11 in the above example), which gives us the
    # normalized result:
    # [('colour', 4/11), ('cold', 1/11), ('conspiracy', 6/11)]
    sqlstr = 'SELECT sum(user_freq) FROM like_input_phrase_view;'
    try:
        count = self.database.execute(sqlstr, sqlargs).fetchall()[0][0]
    except Exception:
        LOGGER.exception(
            'Unexpected error getting total unigram count from user_db')
    # Updating the phrase_frequency dictionary with the normalized
    # results gives: {'conspiracy': 6/11, 'code': 0,
    # 'communicability': 0, 'cold': 1/11, 'colour': 4/11}
    for result_uni in results_uni:
        phrase_frequencies.update(
            [(result_uni[0], result_uni[1]/float(count))])
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'Unigram best_candidates=%s',
            self.best_candidates(phrase_frequencies))
    if not p_phrase:
        # If no context for bigram matching is available, return
        # what we have so far:
        return self.best_candidates(phrase_frequencies)
    sqlstr = (
        'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase GROUP BY phrase;')
    try:
        results_bi = self.database.execute(sqlstr, sqlargs).fetchall()
    except Exception:
        LOGGER.exception(
            'Unexpected error getting “bigram” data from user_db')
    if not results_bi:
        # If no bigram could be matched, return what we have so far:
        return self.best_candidates(phrase_frequencies)
    # Get the total count of p_phrase to normalize the bigram frequencies:
    sqlstr = (
        'SELECT sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase;')
    try:
        count_p_phrase = self.database.execute(
            sqlstr, sqlargs).fetchall()[0][0]
    except Exception:
        LOGGER.exception(
            'Unexpected error getting total bigram count from user_db')
    # Update the phrase frequency dictionary by using a linear
    # combination of the unigram and the bigram results, giving
    # both the weight of 0.5:
    for result_bi in results_bi:
        phrase_frequencies.update(
            [(result_bi[0],
              0.5*result_bi[1]/float(count_p_phrase)
              + 0.5*phrase_frequencies[result_bi[0]])])
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'Bigram best_candidates=%s',
            self.best_candidates(phrase_frequencies))
    if not pp_phrase:
        # If no context for trigram matching is available, return
        # what we have so far:
        return self.best_candidates(phrase_frequencies)
    sqlstr = (
        'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase '
        + 'AND pp_phrase = :pp_phrase GROUP BY phrase;')
    try:
        results_tri = self.database.execute(sqlstr, sqlargs).fetchall()
    except Exception:
        LOGGER.exception(
            'Unexpected error getting “trigram” data from user_db')
    if not results_tri:
        # if no trigram could be matched, return what we have so far:
        return self.best_candidates(phrase_frequencies)
    # Get the total count of (p_phrase, pp_phrase) pairs to
    # normalize the trigram frequencies:
    sqlstr = (
        'SELECT sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase AND pp_phrase = :pp_phrase;')
    try:
        count_pp_phrase_p_phrase = self.database.execute(
            sqlstr, sqlargs).fetchall()[0][0]
    except Exception:
        LOGGER.exception(
            'Unexpected error getting total trigram count from user_db')
    # Update the phrase frequency dictionary by using a linear
    # combination of the bigram and the trigram results, giving
    # both the weight of 0.5 (that makes the total weights: 0.25 *
    # unigram + 0.25 * bigram + 0.5 * trigram, i.e. the trigrams
    # get higher weight):
    for result_tri in results_tri:
        phrase_frequencies.update(
            [(result_tri[0],
              0.5*result_tri[1]/float(count_pp_phrase_p_phrase)
              + 0.5*phrase_frequencies[result_tri[0]])])
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'Trigram best_candidates=%s',
            self.best_candidates(phrase_frequencies))
    return self.best_candidates(phrase_frequencies)
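# To make the 0.5/0.5 mixing above concrete: applied twice, it weights
# the final score as 0.25 * unigram + 0.25 * bigram + 0.5 * trigram.
# A toy calculation (numbers invented for illustration):
unigram = 4 / 11   # normalized unigram frequency of some candidate
bigram = 2 / 5     # normalized bigram frequency of the same candidate
trigram = 1 / 2    # normalized trigram frequency of the same candidate
after_bigram = 0.5 * bigram + 0.5 * unigram
after_trigram = 0.5 * trigram + 0.5 * after_bigram
assert abs(after_trigram
           - (0.25 * unigram + 0.25 * bigram + 0.5 * trigram)) < 1e-12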
def load_dictionary(self):
    '''Load a hunspell dictionary and instantiate an
    enchant.Dict() or a hunspell.Hunspell() object.
    '''
    if DEBUG_LEVEL > 0:
        LOGGER.debug('load_dictionary() ...')
    (self.dic_path,
     self.encoding,
     self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
    if self.words:
        # List of languages where accent insensitive matching makes sense:
        accent_languages = (
            'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
            'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fi',
            'fo', 'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw',
            'hr', 'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky',
            'lb', 'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb',
            'nds', 'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt',
            'qu', 'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj',
            'sq', 'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk',
            'uz', 've', 'vi', 'wa', 'xh',
        )
        if self.name.split('_')[0] in accent_languages:
            self.word_pairs = [(x, itb_util.remove_accents(x))
                               for x in self.words]
        for word in self.words:
            if len(word) > self.max_word_len:
                self.max_word_len = len(word)
        if DEBUG_LEVEL > 1:
            LOGGER.debug('max_word_len = %s', self.max_word_len)
        if self.name.split('_')[0] == 'fi':
            self.enchant_dict = None
            self.pyhunspell_object = None
            if IMPORT_LIBVOIKKO_SUCCESSFUL:
                self.voikko = libvoikko.Voikko('fi')
            return
        if IMPORT_ENCHANT_SUCCESSFUL:
            try:
                self.enchant_dict = enchant.Dict(self.name)
            except enchant.errors.DictNotFoundError:
                LOGGER.exception(
                    'Error initializing enchant for %s', self.name)
                self.enchant_dict = None
            except Exception:
                LOGGER.exception(
                    'Unknown error initializing enchant for %s', self.name)
                self.enchant_dict = None
        elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
            aff_path = self.dic_path.replace('.dic', '.aff')
            try:
                self.pyhunspell_object = hunspell.HunSpell(
                    self.dic_path, aff_path)
            except hunspell.HunSpellError:
                LOGGER.debug(
                    'Error initializing hunspell for %s', self.name)
                self.pyhunspell_object = None
            except Exception:
                LOGGER.debug(
                    'Unknown error initializing hunspell for %s', self.name)
                self.pyhunspell_object = None
def load_dictionary(self):
    '''Load a hunspell dictionary and instantiate an
    enchant.Dict() or a hunspell.Hunspell() object.
    '''
    if DEBUG_LEVEL > 0:
        sys.stderr.write("load_dictionary() ...\n")
    dic_path = os.path.join(self.loc, self.name + '.dic')
    aff_path = os.path.join(self.loc, self.name + '.aff')
    if not os.path.isfile(dic_path) or not os.path.isfile(aff_path):
        sys.stderr.write(
            "load_dictionary %(n)s: %(d)s %(a)s file missing.\n"
            % {'n': self.name, 'd': dic_path, 'a': aff_path})
        return
    aff_buffer = None
    dic_buffer = None
    try:
        aff_buffer = open(
            aff_path,
            mode='r',
            encoding='ISO-8859-1',
            errors='ignore').read().replace('\r\n', '\n')
    except (FileNotFoundError, PermissionError):
        traceback.print_exc()
    except Exception:
        sys.stderr.write(
            'Unexpected error loading .aff File: %s\n' % aff_path)
        traceback.print_exc()
    if aff_buffer:
        encoding_pattern = re.compile(
            r'^[\s]*SET[\s]+(?P<encoding>[-a-zA-Z0-9_]+)[\s]*$',
            re.MULTILINE)
        match = encoding_pattern.search(aff_buffer)
        if match:
            self.encoding = match.group('encoding')
            if DEBUG_LEVEL > 0:
                sys.stderr.write(
                    "load_dictionary(): encoding=%(enc)s found in %(aff)s\n"
                    % {'enc': self.encoding, 'aff': aff_path})
    try:
        dic_buffer = open(dic_path, encoding=self.encoding).readlines()
    except (UnicodeDecodeError, FileNotFoundError, PermissionError):
        if DEBUG_LEVEL > 0:
            sys.stderr.write(
                "load_dictionary(): "
                + "loading %(dic)s as %(enc)s encoding failed, "
                % {'dic': dic_path, 'enc': self.encoding}
                + "fall back to ISO-8859-1.\n")
        self.encoding = 'ISO-8859-1'
        try:
            dic_buffer = open(
                dic_path, encoding=self.encoding).readlines()
        except (UnicodeDecodeError, FileNotFoundError, PermissionError):
            sys.stderr.write(
                "load_dictionary(): "
                + "loading %(dic)s as %(enc)s encoding failed, "
                % {'dic': dic_path, 'enc': self.encoding}
                + "giving up.\n")
            dic_buffer = None
            traceback.print_exc()
            return
        except Exception:
            sys.stderr.write(
                'Unexpected error loading .dic File: %s\n' % dic_path)
            traceback.print_exc()
            return
    except Exception:
        sys.stderr.write(
            'Unexpected error loading .dic File: %s\n' % dic_path)
        traceback.print_exc()
        return
    if dic_buffer:
        if DEBUG_LEVEL > 0:
            sys.stderr.write(
                "load_dictionary(): "
                + "Successfully loaded %(dic)s using %(enc)s encoding.\n"
                % {'dic': dic_path, 'enc': self.encoding})
        # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
        #
        # > A dictionary file (*.dic) contains a list of words, one per
        # > line. The first line of the dictionaries (except personal
        # > dictionaries) contains the word count. Each word may
        # > optionally be followed by a slash ("/") and one or more
        # > flags, which represents affixes or special attributes.
        #
        # Therefore, remove '/' and the following flags from each
        # line to make the buffer a bit smaller and the regular
        # expressions we use later to match words in the
        # dictionary slightly simpler and maybe a tiny bit faster:
        self.words = [
            unicodedata.normalize(
                NORMALIZATION_FORM_INTERNAL,
                re.sub(r'/.*', '', x.replace('\n', '')))
            for x in dic_buffer
        ]
        # List of languages where accent insensitive matching makes sense:
        accent_languages = (
            'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
            'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo',
            'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
            'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
            'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
            'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
            'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
            'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
            've', 'vi', 'wa', 'xh',
        )
        if self.name.split('_')[0] in accent_languages:
            self.word_pairs = [(x, itb_util.remove_accents(x))
                               for x in self.words]
        if IMPORT_ENCHANT_SUCCESSFUL:
            self.enchant_dict = enchant.Dict(self.name)
        elif IMPORT_HUNSPELL_SUCCESSFUL:
            self.pyhunspell_object = hunspell.HunSpell(dic_path, aff_path)
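# A quick illustration of the '/flag' stripping above (the sample .dic
# lines are invented for the example):
import re

dic_buffer_example = ['3\n', 'Haus/N\n', 'laufen/ABC\n', 'schön\n']
words = [re.sub(r'/.*', '', line.replace('\n', ''))
         for line in dic_buffer_example]
# words == ['3', 'Haus', 'laufen', 'schön']
# Note that the word-count header line of the .dic file survives as '3';
# that is presumably harmless here because a digit string will not match
# alphabetic user input.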
def add_phrase(self, input_phrase='', phrase='', p_phrase='',
               pp_phrase='', user_freq=0, commit=True):
    '''
    Add phrase to database
    '''
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "tabsqlitedb.add_phrase() "
            + "input_phrase=%s " % input_phrase.encode('UTF-8')
            + "phrase=%s " % phrase.encode('UTF-8')
            + "user_freq=%s " % user_freq)
    if not input_phrase or not phrase:
        return
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        self._normalization_form_internal, input_phrase)
    phrase = unicodedata.normalize(
        self._normalization_form_internal, phrase)
    p_phrase = unicodedata.normalize(
        self._normalization_form_internal, p_phrase)
    pp_phrase = unicodedata.normalize(
        self._normalization_form_internal, pp_phrase)
    select_sqlstr = '''
    SELECT * FROM user_db.phrases
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    ;'''
    select_sqlargs = {
        'input_phrase': input_phrase,
        'phrase': phrase,
        'p_phrase': p_phrase,
        'pp_phrase': pp_phrase}
    if self.db.execute(select_sqlstr, select_sqlargs).fetchall():
        # there is already such a phrase, i.e. add_phrase was called
        # in error, do nothing to avoid duplicate entries.
        return
    insert_sqlstr = '''
    INSERT INTO user_db.phrases
    (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
    VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase,
            :user_freq, :timestamp)
    ;'''
    insert_sqlargs = {
        'input_phrase': input_phrase,
        'phrase': phrase,
        'p_phrase': p_phrase,
        'pp_phrase': pp_phrase,
        'user_freq': user_freq,
        'timestamp': time.time()}
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "tabsqlitedb.add_phrase() insert_sqlstr=%s\n" % insert_sqlstr)
        sys.stderr.write(
            "tabsqlitedb.add_phrase() insert_sqlargs=%s\n" % insert_sqlargs)
    try:
        self.db.execute(insert_sqlstr, insert_sqlargs)
        if commit:
            self.db.commit()
    except Exception:
        traceback.print_exc()
def suggest(self, input_phrase):
    # pylint: disable=line-too-long
    '''Return completions or corrections for the input phrase

    :param input_phrase: A string to find completions or corrections for
    :type input_phrase: String
    :rtype: A list of tuples of the form (<word>, <score>)
            <score> can have these values:
            0: This is a completion, i.e. input_phrase matches the
               beginning of <word> (accent insensitive match)
            -1: This is a spell checking correction from hunspell
                (i.e. either from enchant or pyhunspell)

    Examples: (Attention, the return values are in internal
    normalization form ('NFD'))

    >>> h = Hunspell(['de_DE', 'cs_CZ'])
    >>> h.suggest('Geschwindigkeitsubertre')[0]
    ('Geschwindigkeitsübertretungsverfahren', 0)
    >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0]
    ('Geschwindigkeitsübertretungsverfahren', 0)
    >>> h.suggest('Glühwürmchen')[0]
    ('Glühwürmchen', 0)
    >>> h.suggest('Alpengluhen')[0]
    ('Alpenglühen', 0)
    >>> h.suggest('filosofictejs')[0]
    ('filosofičtější', 0)
    >>> h.suggest('filosofičtější')[0]
    ('filosofičtější', 0)
    >>> h.suggest('filosofičtějš')[0]
    ('filosofičtější', 0)
    >>> h = Hunspell(['it_IT'])
    >>> h.suggest('principianti')
    [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)]
    >>> h = Hunspell(['es_ES'])
    >>> h.suggest('teneis')
    [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)]
    >>> h.suggest('tenéis')[0]
    ('tenéis', 0)
    >>> h = Hunspell(['en_US'])
    >>> h.suggest('camel')
    [('camel', 0), ('camellia', 0), ('camelhair', 0), ('came', -1), ('Camel', -1), ('cameo', -1), ('came l', -1), ('camels', -1)]
    >>> h = Hunspell(['fr_FR'])
    >>> h.suggest('differemmen')
    [('différemment', 0)]
    >>> h = Hunspell(['None'])
    >>> h.suggest('camel')
    []
    >>> h = Hunspell(['None', 'en_US'])
    >>> h.suggest('camel')
    [('camel', 0), ('camellia', 0), ('camelhair', 0), ('came', -1), ('Camel', -1), ('cameo', -1), ('came l', -1), ('camels', -1)]
    '''
    # pylint: enable=line-too-long
    if input_phrase in self._suggest_cache:
        return self._suggest_cache[input_phrase]
    if DEBUG_LEVEL > 1:
        LOGGER.debug(
            'Hunspell.suggest() input_phrase=%(ip)s',
            {'ip': input_phrase.encode('UTF-8')})
    # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
    #
    # > A dictionary file (*.dic) contains a list of words, one per
    # > line. The first line of the dictionaries (except personal
    # > dictionaries) contains the word count. Each word may
    # > optionally be followed by a slash ("/") and one or more
    # > flags, which represents affixes or special attributes.
    #
    # I.e. if '/' is already contained in the input, it cannot
    # match a word in the dictionary and we return an empty list
    # immediately:
    if '/' in input_phrase:
        self._suggest_cache[input_phrase] = []
        return []
    # make sure input_phrase is in the internal normalization form (NFD):
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    input_phrase_no_accents = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL,
        itb_util.remove_accents(input_phrase))
    # But enchant and pyhunspell want NFC as input, make a copy in NFC:
    input_phrase_nfc = unicodedata.normalize('NFC', input_phrase)
    suggested_words = {}
    for dictionary in self._dictionaries:
        if dictionary.words:
            # If the input phrase is longer than the maximum
            # word length in a dictionary, don’t try to
            # complete it, it just wastes time then.
            if len(input_phrase) <= dictionary.max_word_len:
                if dictionary.word_pairs:
                    suggested_words.update([
                        (x[0], 0)
                        for x in dictionary.word_pairs
                        if x[1].startswith(input_phrase_no_accents)])
                else:
                    suggested_words.update([
                        (x, 0)
                        for x in dictionary.words
                        if x.startswith(input_phrase)])
            if len(input_phrase) >= 4:
                if dictionary.spellcheck(input_phrase):
                    # This is a valid word in this dictionary.
                    # It might have been missed by the
                    # matching above because the dictionary
                    # might not contain all possible word
                    # forms (The prefix and suffix information
                    # has been ignored). But the spell checker
                    # knows about this, if the spell checker
                    # thinks it is a correct word, it must be
                    # counted as a match of course:
                    suggested_words[input_phrase] = 0
                extra_suggestions = [
                    unicodedata.normalize(
                        itb_util.NORMALIZATION_FORM_INTERNAL, x)
                    for x in dictionary.spellcheck_suggest(input_phrase)
                ]
                suggested_words.update([
                    (suggestion, -1)
                    for suggestion in extra_suggestions
                    if suggestion not in suggested_words])
    for word in suggested_words:
        if (suggested_words[word] == -1
                and itb_util.remove_accents(word)
                == itb_util.remove_accents(input_phrase)):
            # This spell checking correction is actually even
            # an accent insensitive match, adjust accordingly:
            suggested_words[word] = 0
    sorted_suggestions = sorted(
        suggested_words.items(),
        key=lambda x: (
            -x[1],       # 0: in dictionary, -1: hunspell
            len(x[0]),   # length of word ascending
            x[0],        # alphabetical
        ))[0:MAX_WORDS]
    self._suggest_cache[input_phrase] = sorted_suggestions
    return sorted_suggestions
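# How the sort key above orders candidates, shown on a toy dict (the
# words and scores are invented for illustration): completions (score 0)
# come first, then hunspell corrections (score -1); ties break by
# ascending word length, then alphabetically:
toy = {'came': -1, 'camel': 0, 'camellia': 0, 'cameo': -1}
ranked = sorted(toy.items(), key=lambda x: (-x[1], len(x[0]), x[0]))
# ranked == [('camel', 0), ('camellia', 0), ('came', -1), ('cameo', -1)]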
def check_phrase_and_update_frequency(
        self, input_phrase='', phrase='', p_phrase='',
        pp_phrase='', user_freq_increment=1, commit=True):
    '''
    Check whether input_phrase and phrase are already in database. If
    they are in the database, increase the frequency by 1, if not add
    them.
    '''
    if not input_phrase:
        input_phrase = phrase
    if not phrase:
        return
    phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.check_phrase_and_update_frequency() "
            + "phrase=%(p)s, input_phrase=%(t)s\n"
            % {'p': phrase.encode('UTF-8'),
               't': input_phrase.encode('UTF-8')})
    # There should never be more than 1 database row for the same
    # input_phrase *and* phrase. So the following query on
    # the database should match at most one database
    # row and the length of the result array should be 0 or
    # 1. So the “GROUP BY phrase” is actually redundant. It is
    # only a safeguard for the case when duplicate rows have been
    # added to the database accidentally (But in that case there
    # is a bug somewhere else which should be fixed).
    sqlstr = '''
    SELECT max(user_freq) FROM user_db.phrases
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    GROUP BY phrase
    ;'''
    sqlargs = {'input_phrase': input_phrase,
               'phrase': phrase,
               'p_phrase': p_phrase,
               'pp_phrase': pp_phrase}
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.check_phrase_and_update_frequency() sqlstr=%s\n"
            % sqlstr)
        sys.stderr.write(
            "TabSqliteDb.check_phrase_and_update_frequency() sqlargs=%s\n"
            % sqlargs)
    result = self.db.execute(sqlstr, sqlargs).fetchall()
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "check_phrase_and_update_frequency() result=%s\n" % result)
    if result:
        # A match was found in user_db, increase user frequency by
        # user_freq_increment (1 by default)
        self.update_phrase(input_phrase=input_phrase,
                           phrase=phrase,
                           p_phrase=p_phrase,
                           pp_phrase=pp_phrase,
                           user_freq=result[0][0]+user_freq_increment,
                           commit=commit)
        return
    # The phrase was not found in user_db.
    # Add it as a new phrase, i.e. with user_freq = user_freq_increment
    # (1 by default):
    self.add_phrase(input_phrase=input_phrase,
                    phrase=phrase,
                    p_phrase=p_phrase,
                    pp_phrase=pp_phrase,
                    user_freq=user_freq_increment,
                    commit=commit)
    return
def select_words(self, input_phrase, p_phrase='', pp_phrase=''):
    '''
    Get phrases from database completing input_phrase.

    Returns a list of matches where each match is a tuple in the
    form of (phrase, user_freq), i.e. returns something like
    [(phrase, user_freq), ...]
    '''
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.select_words() "
            + "input_phrase=%s " % input_phrase.encode('UTF-8')
            + "p_phrase=%s " % p_phrase.encode('UTF-8')
            + "pp_phrase=%s\n" % pp_phrase.encode('UTF-8'))
    phrase_frequencies = {}
    if ' ' not in input_phrase:
        # Get suggestions from hunspell dictionaries. But only
        # if input_phrase does not contain spaces. The hunspell
        # dictionaries contain only single words, not sentences.
        # Trying to complete an input_phrase which contains spaces
        # will never work and spell checking suggestions by hunspell
        # for input which contains spaces is almost always nonsense.
        phrase_frequencies.update([
            x for x in self.hunspell_obj.suggest(input_phrase)])
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.select_words() hunspell: best_candidates=%s\n"
            % self.best_candidates(phrase_frequencies))
    # Remove the accents *after* getting the hunspell candidates.
    # If the accents were removed before getting the hunspell candidates
    # an input phrase like “Glühwürmchen” would not be added as a
    # candidate because hunspell would get “Gluhwurmchen” then and would
    # not validate that as a correct word. And, because “Glühwürmchen”
    # is not in the German hunspell dictionary as a single word but
    # created by suffix and prefix rules, the accent insensitive match
    # in the German hunspell dictionary would not find it either.
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    # Now phrase_frequencies might contain something like this:
    #
    # {'code': 0, 'communicability': 0, 'cold': 0, 'colour': 0}

    # To quote a string to be used as a parameter when assembling
    # an sqlite statement with Python string operations, remove
    # all NUL characters, replace " with "" and wrap the whole
    # string in double quotes. Assembling sqlite statements using
    # parameters containing user input with python string operations
    # is not recommended because of the risk of SQL injection attacks
    # if the quoting is not done the right way. So it is better to use
    # the parameter substitution of the sqlite3 python interface.
    # But unfortunately that does not work when creating views,
    # (“OperationalError: parameters are not allowed in views”).
    quoted_input_phrase = input_phrase.replace(
        '\x00', '').replace('"', '""')
    self.db.execute('DROP VIEW IF EXISTS like_input_phrase_view;')
    sqlstr = '''
    CREATE TEMPORARY VIEW IF NOT EXISTS like_input_phrase_view AS
    SELECT * FROM user_db.phrases
    WHERE input_phrase LIKE "%(quoted_input_phrase)s%%"
    ;''' % {'quoted_input_phrase': quoted_input_phrase}
    self.db.execute(sqlstr)
    sqlargs = {'p_phrase': p_phrase, 'pp_phrase': pp_phrase}
    sqlstr = (
        'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
        + 'GROUP BY phrase;')
    try:
        # Get “unigram” data from user_db.
        #
        # Example: Let’s assume the user typed “co” and user_db contains
        #
        # 1|colou|colour|green|nice|1
        # 2|col|colour|yellow|ugly|2
        # 3|co|colour|green|awesome|1
        # 4|co|cold|||1
        # 5|conspirac|conspiracy|||5
        # 6|conspi|conspiracy|||1
        # 7|c|conspiracy|||1
        results_uni = self.db.execute(sqlstr, sqlargs).fetchall()
        # Then the result returned by .fetchall() is:
        #
        # [('colour', 4), ('cold', 1), ('conspiracy', 6)]
        #
        # (“c|conspiracy|1” is not selected because it doesn’t
        # match the user input “LIKE co%”! I.e. this is filtered
        # out by the VIEW created above already)
    except Exception:
        traceback.print_exc()
    if not results_uni:
        # If no unigrams matched, bigrams and trigrams cannot
        # match either. We can stop here and return what we got
        # from hunspell.
        return self.best_candidates(phrase_frequencies)
    # Now normalize the unigram frequencies with the total count
    # (which is 11 in the above example), which gives us the
    # normalized result:
    # [('colour', 4/11), ('cold', 1/11), ('conspiracy', 6/11)]
    sqlstr = 'SELECT sum(user_freq) FROM like_input_phrase_view;'
    try:
        count = self.db.execute(sqlstr, sqlargs).fetchall()[0][0]
    except Exception:
        traceback.print_exc()
    # Updating the phrase_frequency dictionary with the normalized
    # results gives: {'conspiracy': 6/11, 'code': 0,
    # 'communicability': 0, 'cold': 1/11, 'colour': 4/11}
    for x in results_uni:
        phrase_frequencies.update([(x[0], x[1]/float(count))])
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.select_words() Unigram best_candidates=%s\n"
            % self.best_candidates(phrase_frequencies))
    if not p_phrase:
        # If no context for bigram matching is available, return
        # what we have so far:
        return self.best_candidates(phrase_frequencies)
    sqlstr = (
        'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase GROUP BY phrase;')
    try:
        results_bi = self.db.execute(sqlstr, sqlargs).fetchall()
    except Exception:
        traceback.print_exc()
    if not results_bi:
        # If no bigram could be matched, return what we have so far:
        return self.best_candidates(phrase_frequencies)
    # get the total count of p_phrase to normalize the bigram frequencies:
    sqlstr = (
        'SELECT sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase;')
    try:
        count_p_phrase = self.db.execute(sqlstr, sqlargs).fetchall()[0][0]
    except Exception:
        traceback.print_exc()
    # Update the phrase frequency dictionary by using a linear
    # combination of the unigram and the bigram results, giving
    # both the weight of 0.5:
    for x in results_bi:
        phrase_frequencies.update(
            [(x[0],
              0.5*x[1]/float(count_p_phrase)
              + 0.5*phrase_frequencies[x[0]])])
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.select_words() Bigram best_candidates=%s\n"
            % self.best_candidates(phrase_frequencies))
    if not pp_phrase:
        # If no context for trigram matching is available, return
        # what we have so far:
        return self.best_candidates(phrase_frequencies)
    sqlstr = (
        'SELECT phrase, sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase '
        + 'AND pp_phrase = :pp_phrase GROUP BY phrase;')
    try:
        results_tri = self.db.execute(sqlstr, sqlargs).fetchall()
    except Exception:
        traceback.print_exc()
    if not results_tri:
        # if no trigram could be matched, return what we have so far:
        return self.best_candidates(phrase_frequencies)
    # get the total count of (p_phrase, pp_phrase) pairs to
    # normalize the trigram frequencies:
    sqlstr = (
        'SELECT sum(user_freq) FROM like_input_phrase_view '
        + 'WHERE p_phrase = :p_phrase AND pp_phrase = :pp_phrase;')
    try:
        count_pp_phrase_p_phrase = self.db.execute(
            sqlstr, sqlargs).fetchall()[0][0]
    except Exception:
        traceback.print_exc()
    # Update the phrase frequency dictionary by using a linear
    # combination of the bigram and the trigram results, giving
    # both the weight of 0.5 (that makes the total weights: 0.25 *
    # unigram + 0.25 * bigram + 0.5 * trigram, i.e. the trigrams
    # get higher weight):
    for x in results_tri:
        phrase_frequencies.update(
            [(x[0],
              0.5*x[1]/float(count_pp_phrase_p_phrase)
              + 0.5*phrase_frequencies[x[0]])])
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.select_words() Trigram best_candidates=%s\n"
            % self.best_candidates(phrase_frequencies))
    return self.best_candidates(phrase_frequencies)
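# What the quoting above protects against, shown on a hostile input
# (illustration only; the input string is invented):
hostile = 'co"; DROP TABLE user_db.phrases; --'
quoted = hostile.replace('\x00', '').replace('"', '""')
# quoted == 'co""; DROP TABLE user_db.phrases; --'
# Inside the double-quoted LIKE pattern of the CREATE VIEW statement,
# the doubled quotes remain literal text, so the input cannot close the
# string and inject SQL into the view definition.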
def add_phrase(self, input_phrase='', phrase='', p_phrase='',
               pp_phrase='', user_freq=0, commit=True):
    '''
    Add phrase to database
    '''
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.add_phrase() "
            + "input_phrase=%s " % input_phrase.encode('UTF-8')
            + "phrase=%s " % phrase.encode('UTF-8')
            + "user_freq=%s " % user_freq)
    if not input_phrase or not phrase:
        return
    input_phrase = itb_util.remove_accents(input_phrase)
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, phrase)
    p_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, p_phrase)
    pp_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, pp_phrase)
    select_sqlstr = '''
    SELECT * FROM user_db.phrases
    WHERE input_phrase = :input_phrase
    AND phrase = :phrase
    AND p_phrase = :p_phrase
    AND pp_phrase = :pp_phrase
    ;'''
    select_sqlargs = {
        'input_phrase': input_phrase,
        'phrase': phrase,
        'p_phrase': p_phrase,
        'pp_phrase': pp_phrase}
    if self.db.execute(select_sqlstr, select_sqlargs).fetchall():
        # there is already such a phrase, i.e. add_phrase was called
        # in error, do nothing to avoid duplicate entries.
        return
    insert_sqlstr = '''
    INSERT INTO user_db.phrases
    (input_phrase, phrase, p_phrase, pp_phrase, user_freq, timestamp)
    VALUES (:input_phrase, :phrase, :p_phrase, :pp_phrase,
            :user_freq, :timestamp)
    ;'''
    insert_sqlargs = {'input_phrase': input_phrase,
                      'phrase': phrase,
                      'p_phrase': p_phrase,
                      'pp_phrase': pp_phrase,
                      'user_freq': user_freq,
                      'timestamp': time.time()}
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "TabSqliteDb.add_phrase() insert_sqlstr=%s\n" % insert_sqlstr)
        sys.stderr.write(
            "TabSqliteDb.add_phrase() insert_sqlargs=%s\n" % insert_sqlargs)
    try:
        self.db.execute(insert_sqlstr, insert_sqlargs)
        if commit:
            self.db.commit()
    except Exception:
        traceback.print_exc()
def suggest(self, input_phrase):
    '''Return completions or corrections for the input phrase

    :param input_phrase: A string to find completions or corrections for
    :type input_phrase: String
    :rtype: A list of tuples of the form (<word>, <score>)
            <score> can have these values:
            0: This is a completion, i.e. input_phrase matches the
               beginning of <word> (accent insensitive match)
            -1: This is a spell checking correction from hunspell
                (i.e. either from enchant or pyhunspell)

    Examples: (Attention, the return values are in internal
    normalization form ('NFD'))

    >>> h = Hunspell(['de_DE', 'cs_CZ'])
    >>> h.suggest('Geschwindigkeitsubertre')[0]
    ('Geschwindigkeitsübertretungsverfahren', 0)
    >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0]
    ('Geschwindigkeitsübertretungsverfahren', 0)
    >>> h.suggest('Glühwürmchen')[0]
    ('Glühwürmchen', 0)
    >>> h.suggest('Alpengluhen')[0]
    ('Alpenglühen', 0)
    >>> h.suggest('filosofictejsi')
    [('filosofičtější', 0), ('filosofičtěji', -1)]
    >>> h.suggest('filosofictejs')[0]
    ('filosofičtější', 0)
    >>> h.suggest('filosofičtější')[0]
    ('filosofičtější', 0)
    >>> h.suggest('filosofičtějš')[0]
    ('filosofičtější', 0)
    >>> h = Hunspell(['it_IT'])
    >>> h.suggest('principianti')
    [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)]
    >>> h = Hunspell(['es_ES'])
    >>> h.suggest('teneis')
    [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)]
    >>> h.suggest('tenéis')[0]
    ('tenéis', 0)
    '''
    if input_phrase in self._suggest_cache:
        return self._suggest_cache[input_phrase]
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "Hunspell.suggest() input_phrase=%(ip)s\n"
            % {'ip': input_phrase.encode('UTF-8')})
    # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
    #
    # > A dictionary file (*.dic) contains a list of words, one per
    # > line. The first line of the dictionaries (except personal
    # > dictionaries) contains the word count. Each word may
    # > optionally be followed by a slash ("/") and one or more
    # > flags, which represents affixes or special attributes.
    #
    # I.e. if '/' is already contained in the input, it cannot
    # match a word in the dictionary and we return an empty list
    # immediately:
    if '/' in input_phrase:
        self._suggest_cache[input_phrase] = []
        return []
    # make sure input_phrase is in the internal normalization form (NFD):
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    input_phrase_no_accents = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL,
        itb_util.remove_accents(input_phrase))
    # But enchant and pyhunspell want NFC as input, make a copy in NFC:
    input_phrase_nfc = unicodedata.normalize('NFC', input_phrase)
    suggested_words = {}
    for dictionary in self._dictionaries:
        if dictionary.words:
            # If the input phrase is longer than the maximum
            # word length in a dictionary, don’t try to
            # complete it, it just wastes time then.
            if len(input_phrase) <= dictionary.max_word_len:
                if dictionary.word_pairs:
                    suggested_words.update([
                        (x[0], 0)
                        for x in dictionary.word_pairs
                        if x[1].startswith(input_phrase_no_accents)])
                else:
                    suggested_words.update([
                        (x, 0)
                        for x in dictionary.words
                        if x.startswith(input_phrase)])
            if dictionary.enchant_dict:
                if len(input_phrase) >= 4:
                    # Always pass NFC to enchant and convert the
                    # result back to the internal normalization
                    # form (NFD) (enchant does the right thing for
                    # Korean if the input is NFC). enchant takes
                    # unicode strings and returns unicode strings,
                    # no encoding and decoding to and from the
                    # hunspell dictionary encoding is necessary
                    # (neither for Python2 nor Python3).
                    # (pyhunspell needs to get its input passed
                    # in dictionary encoding and also returns it
                    # in dictionary encoding).
                    if dictionary.enchant_dict.check(input_phrase_nfc):
                        # This is a valid word in this dictionary.
                        # It might have been missed by the matching
                        # above because the dictionary might not
                        # contain all possible word forms (The
                        # prefix and suffix information has been
                        # ignored). But hunspell knows about this,
                        # if hunspell thinks it is a correct word,
                        # it must be counted as a match of course:
                        suggested_words[input_phrase] = 0
                    extra_suggestions = [
                        unicodedata.normalize(
                            itb_util.NORMALIZATION_FORM_INTERNAL, x)
                        for x in dictionary.enchant_dict.suggest(
                            input_phrase_nfc)
                    ]
                    suggested_words.update([
                        (suggestion, -1)
                        for suggestion in extra_suggestions
                        if suggestion not in suggested_words])
            elif dictionary.pyhunspell_object:
                if len(input_phrase) >= 4:
                    # Always pass NFC to pyhunspell and convert
                    # the result back to the internal
                    # normalization form (NFD) (hunspell does the
                    # right thing for Korean if the input is NFC).
                    if dictionary.pyhunspell_object.spell(
                            input_phrase_nfc.encode(
                                dictionary.encoding, 'replace')):
                        # This is a valid word in this dictionary.
                        # It might have been missed by the matching
                        # above because the dictionary might not
                        # contain all possible word forms (The
                        # prefix and suffix information has been
                        # ignored). But hunspell knows about this,
                        # if hunspell thinks it is a correct word,
                        # it must be counted as a match of course:
                        suggested_words[input_phrase] = 0
                    extra_suggestions = [
                        unicodedata.normalize(
                            itb_util.NORMALIZATION_FORM_INTERNAL, x)
                        for x in dictionary.pyhunspell_object.suggest(
                            input_phrase_nfc.encode(
                                dictionary.encoding, 'replace'))
                    ]
                    suggested_words.update([
                        (suggestion, -1)
                        for suggestion in extra_suggestions
                        if suggestion not in suggested_words])
        else:
            if (dictionary.name[:2]
                    not in ('ja', 'ja_JP',
                            'zh', 'zh_CN', 'zh_TW', 'zh_MO', 'zh_SG')):
                # For some languages, hunspell dictionaries don’t
                # exist because hunspell makes no sense for these
                # languages. In these cases, just ignore that the
                # hunspell dictionary is missing. With the
                # appropriate input method added, emoji can be
                # matched nevertheless.
                suggested_words.update([
                    ('☹ %(name)s dictionary not found. '
                     % {'name': dictionary.name}
                     + 'Please install hunspell dictionary!',
                     0)])
    for word in suggested_words:
        if (suggested_words[word] == -1
                and itb_util.remove_accents(word)
                == itb_util.remove_accents(input_phrase)):
            # This spell checking correction is actually even
            # an accent insensitive match, adjust accordingly:
            suggested_words[word] = 0
    sorted_suggestions = sorted(
        suggested_words.items(),
        key=lambda x: (
            -x[1],       # 0: in dictionary, -1: hunspell
            len(x[0]),   # length of word ascending
            x[0],        # alphabetical
        ))[0:MAX_WORDS]
    self._suggest_cache[input_phrase] = sorted_suggestions
    return sorted_suggestions
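# Why the encode() calls above matter: enchant accepts Python str
# directly, while pyhunspell is given bytes in the dictionary's own
# encoding. A tiny sketch of preparing the input for both paths
# ('ISO-8859-1' is only an example value for dictionary.encoding):
import unicodedata

word_nfc = unicodedata.normalize('NFC', 'tenéis')  # spell checkers get NFC
raw = word_nfc.encode('ISO-8859-1', 'replace')     # bytes for pyhunspell
# 'replace' substitutes '?' for characters the dictionary encoding
# cannot represent, instead of raising UnicodeEncodeError:
assert 'ẞ'.encode('ISO-8859-1', 'replace') == b'?'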
def suggest(self, input_phrase):
    # pylint: disable=line-too-long
    '''Return completions or corrections for the input phrase

    :param input_phrase: A string to find completions or corrections for
    :type input_phrase: String
    :rtype: A list of tuples of the form (<word>, <score>)
            <score> can have these values:
            0: This is a completion, i.e. input_phrase matches the
               beginning of <word> (accent insensitive match)
            -1: This is a spell checking correction from hunspell
                (i.e. either from enchant or pyhunspell)

    Examples: (Attention, the return values are in internal
    normalization form ('NFD'))

    >>> h = Hunspell(['de_DE', 'cs_CZ'])
    >>> h.suggest('Geschwindigkeitsubertre')[0]
    ('Geschwindigkeitsübertretungsverfahren', 0)
    >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0]
    ('Geschwindigkeitsübertretungsverfahren', 0)
    >>> h.suggest('Glühwürmchen')[0]
    ('Glühwürmchen', 0)
    >>> h.suggest('Alpengluhen')[0]
    ('Alpenglühen', 0)
    >>> h.suggest('filosofictejsi')
    [('filosofičtější', 0), ('filosofičtěji', -1)]
    >>> h.suggest('filosofictejs')[0]
    ('filosofičtější', 0)
    >>> h.suggest('filosofičtější')[0]
    ('filosofičtější', 0)
    >>> h.suggest('filosofičtějš')[0]
    ('filosofičtější', 0)
    >>> h = Hunspell(['it_IT'])
    >>> h.suggest('principianti')
    [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)]
    >>> h = Hunspell(['es_ES'])
    >>> h.suggest('teneis')
    [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)]
    >>> h.suggest('tenéis')[0]
    ('tenéis', 0)
    '''
    # pylint: enable=line-too-long
    if input_phrase in self._suggest_cache:
        return self._suggest_cache[input_phrase]
    if DEBUG_LEVEL > 1:
        sys.stderr.write(
            "Hunspell.suggest() input_phrase=%(ip)s\n"
            % {'ip': input_phrase.encode('UTF-8')})
    # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
    #
    # > A dictionary file (*.dic) contains a list of words, one per
    # > line. The first line of the dictionaries (except personal
    # > dictionaries) contains the word count. Each word may
    # > optionally be followed by a slash ("/") and one or more
    # > flags, which represents affixes or special attributes.
    #
    # I.e. if '/' is already contained in the input, it cannot
    # match a word in the dictionary and we return an empty list
    # immediately:
    if '/' in input_phrase:
        self._suggest_cache[input_phrase] = []
        return []
    # make sure input_phrase is in the internal normalization form (NFD):
    input_phrase = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase)
    input_phrase_no_accents = unicodedata.normalize(
        itb_util.NORMALIZATION_FORM_INTERNAL,
        itb_util.remove_accents(input_phrase))
    # But enchant and pyhunspell want NFC as input, make a copy in NFC:
    input_phrase_nfc = unicodedata.normalize('NFC', input_phrase)
    suggested_words = {}
    for dictionary in self._dictionaries:
        if dictionary.words:
            # If the input phrase is longer than the maximum
            # word length in a dictionary, don’t try to
            # complete it, it just wastes time then.
            if len(input_phrase) <= dictionary.max_word_len:
                if dictionary.word_pairs:
                    suggested_words.update([
                        (x[0], 0)
                        for x in dictionary.word_pairs
                        if x[1].startswith(input_phrase_no_accents)])
                else:
                    suggested_words.update([
                        (x, 0)
                        for x in dictionary.words
                        if x.startswith(input_phrase)])
            if dictionary.enchant_dict:
                if len(input_phrase) >= 4:
                    # Always pass NFC to enchant and convert the
                    # result back to the internal normalization
                    # form (NFD) (enchant does the right thing for
                    # Korean if the input is NFC). enchant takes
                    # unicode strings and returns unicode strings,
                    # no encoding and decoding to and from the
                    # hunspell dictionary encoding is necessary
                    # (neither for Python2 nor Python3).
                    # (pyhunspell needs to get its input passed
                    # in dictionary encoding and also returns it
                    # in dictionary encoding).
                    if dictionary.enchant_dict.check(input_phrase_nfc):
                        # This is a valid word in this dictionary.
                        # It might have been missed by the matching
                        # above because the dictionary might not
                        # contain all possible word forms (The
                        # prefix and suffix information has been
                        # ignored). But hunspell knows about this,
                        # if hunspell thinks it is a correct word,
                        # it must be counted as a match of course:
                        suggested_words[input_phrase] = 0
                    extra_suggestions = [
                        unicodedata.normalize(
                            itb_util.NORMALIZATION_FORM_INTERNAL, x)
                        for x in dictionary.enchant_dict.suggest(
                            input_phrase_nfc)
                    ]
                    suggested_words.update([
                        (suggestion, -1)
                        for suggestion in extra_suggestions
                        if suggestion not in suggested_words])
            elif dictionary.pyhunspell_object:
                if len(input_phrase) >= 4:
                    # Always pass NFC to pyhunspell and convert
                    # the result back to the internal
                    # normalization form (NFD) (hunspell does the
                    # right thing for Korean if the input is NFC).
                    if dictionary.pyhunspell_object.spell(
                            input_phrase_nfc.encode(
                                dictionary.encoding, 'replace')):
                        # This is a valid word in this dictionary.
                        # It might have been missed by the matching
                        # above because the dictionary might not
                        # contain all possible word forms (The
                        # prefix and suffix information has been
                        # ignored). But hunspell knows about this,
                        # if hunspell thinks it is a correct word,
                        # it must be counted as a match of course:
                        suggested_words[input_phrase] = 0
                    extra_suggestions = [
                        unicodedata.normalize(
                            itb_util.NORMALIZATION_FORM_INTERNAL, x)
                        for x in dictionary.pyhunspell_object.suggest(
                            input_phrase_nfc.encode(
                                dictionary.encoding, 'replace'))
                    ]
                    suggested_words.update([
                        (suggestion, -1)
                        for suggestion in extra_suggestions
                        if suggestion not in suggested_words])
        else:
            if (dictionary.name[:2]
                    not in ('ja', 'ja_JP',
                            'zh', 'zh_CN', 'zh_TW', 'zh_MO', 'zh_SG')):
                # For some languages, hunspell dictionaries don’t
                # exist because hunspell makes no sense for these
                # languages. In these cases, just ignore that the
                # hunspell dictionary is missing. With the
                # appropriate input method added, emoji can be
                # matched nevertheless.
                suggested_words.update([
                    ('☹ %(name)s dictionary not found. '
                     % {'name': dictionary.name}
                     + 'Please install hunspell dictionary!',
                     0)])
    for word in suggested_words:
        if (suggested_words[word] == -1
                and itb_util.remove_accents(word)
                == itb_util.remove_accents(input_phrase)):
            # This spell checking correction is actually even
            # an accent insensitive match, adjust accordingly:
            suggested_words[word] = 0
    sorted_suggestions = sorted(
        suggested_words.items(),
        key=lambda x: (
            -x[1],       # 0: in dictionary, -1: hunspell
            len(x[0]),   # length of word ascending
            x[0],        # alphabetical
        ))[0:MAX_WORDS]
    self._suggest_cache[input_phrase] = sorted_suggestions
    return sorted_suggestions