Example #1
 def update(self, index, iterable, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     writer = AsyncWriter(self.index)
     
     for obj in iterable:
         doc = index.full_prepare(obj)
         
         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in doc:
             doc[key] = self._from_python(doc[key])
         
         writer.update_document(**doc)
     
     if len(iterable) > 0:
         # For now, commit no matter what, as we run into locking issues otherwise.
         writer.commit()
         
         # If spelling support is desired, add to the dictionary.
         if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
             sp = SpellChecker(self.storage)
             sp.add_field(self.index, self.content_field_name)
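
For reference, a minimal sketch of querying the dictionary this method builds. It assumes the pre-2.x Whoosh API, where whoosh.spelling.SpellChecker still exists (later Whoosh versions replaced it with searcher.corrector()); the index path is illustrative.

from whoosh.filedb.filestorage import FileStorage
from whoosh.spelling import SpellChecker

storage = FileStorage('whoosh_index')  # hypothetical index directory
checker = SpellChecker(storage)
# Ask for up to three corrections for a misspelled term.
print(checker.suggest(u'serach', number=3))  # e.g. [u'search']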
Example #2
 def create_spelling_suggestion(self, query_string):
     spelling_suggestion = None
     sp = SpellChecker(self.storage)
     cleaned_query = force_unicode(query_string)
     
     if not query_string:
         return spelling_suggestion
     
     # Clean the string.
     for rev_word in self.RESERVED_WORDS:
         cleaned_query = cleaned_query.replace(rev_word, '')
     
     for rev_char in self.RESERVED_CHARACTERS:
         cleaned_query = cleaned_query.replace(rev_char, '')
     
     # Break it down.
     query_words = cleaned_query.split()
     suggested_words = []
     
     for word in query_words:
         suggestions = sp.suggest(word, number=1)
         
         if len(suggestions) > 0:
             suggested_words.append(suggestions[0])
     
     spelling_suggestion = ' '.join(suggested_words)
     return spelling_suggestion
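
To make the cleaning step concrete, here is the same string logic run standalone (RESERVED_WORDS and RESERVED_CHARACTERS as defined in Example #6; the query is made up):

query = u'whoosh AND serach?'
for word in ('AND', 'NOT', 'OR', 'TO'):
    query = query.replace(word, '')
for char in ('\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
             '[', ']', '^', '"', '~', '*', '?', ':', '.'):
    query = query.replace(char, '')
print(query.split())  # [u'whoosh', u'serach'], each word then goes to sp.suggest()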
Example #3
 def update(self, index, iterable, commit=True):
     if not self.setup_complete:
         self.setup()
     
     self.index = self.index.refresh()
     writer = self.index.writer()
     
     for obj in iterable:
         doc = {}
         doc['id'] = force_unicode(self.get_identifier(obj))
         doc['django_ct'] = force_unicode("%s.%s" % (obj._meta.app_label, obj._meta.module_name))
         doc['django_id'] = force_unicode(obj.pk)
         other_data = index.prepare(obj)
         
         # Really make sure it's unicode, because Whoosh won't have it any
         # other way.
         for key in other_data:
             other_data[key] = self._from_python(other_data[key])
         
         doc.update(other_data)
         writer.update_document(**doc)
     
     # For now, commit no matter what, as we run into locking issues otherwise.
     writer.commit()
     
     # If spelling support is desired, add to the dictionary.
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
         sp = SpellChecker(self.storage)
         sp.add_field(self.index, self.content_field_name)
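
For context, this is roughly the document dict the loop assembles before writer.update_document(**doc) is called (field values are illustrative, not from the source):

doc = {
    'id': u'myapp.book.42',      # get_identifier(obj): "<app_label>.<model>.<pk>"
    'django_ct': u'myapp.book',  # "<app_label>.<module_name>"
    'django_id': u'42',          # obj.pk as unicode
    'text': u'...the prepared document content...',  # merged in from index.prepare(obj)
}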
Example #4
 def update(self, documents, commit=True):
     writer = self.index.writer()
     
     for doc in documents:
         writer.update_document(**doc)
     
     if commit is True:
         writer.commit()
     
     # If spelling support is desired, add to the dictionary.
     if self.include_spelling is True:
         sp = SpellChecker(self.storage)
         sp.add_field(self.index, self.content_field_name)
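
A minimal sketch of the update_document() semantics this method relies on: given a unique field in the schema, writing the same key twice replaces the earlier document rather than duplicating it (standard Whoosh API; the path and values are illustrative):

import os
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestorage import FileStorage

schema = Schema(id=ID(stored=True, unique=True), content=TEXT(stored=True))
if not os.path.exists('ix_dir'):
    os.makedirs('ix_dir')
ix = FileStorage('ix_dir').create_index(schema)

writer = ix.writer()
writer.update_document(id=u'1', content=u'first version')
writer.commit()

writer = ix.writer()
writer.update_document(id=u'1', content=u'second version')  # replaces doc id=1
writer.commit()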
Example #5
    def __init__(self):
        # Window title (Thai): "Program for searching and comparing the
        # Buddha's words from the Tipitaka (E-Tipitaka %s)"
        wx.Frame.__init__(self, parent=None, id=-1,
            title=u'โปรแกรมตรวจหาและเทียบเคียงพุทธวจนจากพระไตรปิฎก (E-Tipitaka %s)'%(manifest.__version__),size=(1000,700))

        icon = wx.IconBundle()
        icon.AddIconFromFile(os.path.join('resources','e-tri_64_icon.ico'), wx.BITMAP_TYPE_ANY)
        self.SetIcons(icon)

        self.now = 0
        self.total = 0
        self.pages = 0
        self.per = 10
        self.lang = 'thai'
        self.mode = 'all'
        self.speller = {}
        self.checkedItems = range(45)
        self.read = []
        self.wildcard = u'E-Tipitaka Backup File (*.etz)|*.etz'

        for lang in ['thai','pali']:
            st = FileStorage(os.path.join('spell_%s'%(lang)))
            self.speller[lang] = SpellChecker(st)

        f = open(os.path.join('resources','book_name.pkl'),'rb')
        self.bookNames = cPickle.load(f)
        f.close()
 
        self.resultWindow = ResultWindow(self.lang,self)

        self.font = self.LoadFont()
        if self.font is not None and self.font.IsOk():
            self.resultWindow.SetStandardFonts(self.font.GetPointSize(),self.font.GetFaceName())                    

        self.statusBar = self.CreateMyStatusBar()
        self.topSizer = self.CreateSearchBar()

        self.sizer = wx.BoxSizer(wx.VERTICAL)
        self.sizer.Add(self.topSizer,flag=wx.EXPAND)
        self.sizer.Add(self.resultWindow,1,flag=wx.EXPAND)
        self.SetSizer(self.sizer)
    
        self.Bind(wx.EVT_CLOSE, self.OnClose)
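
Note that the FileStorage('spell_%s' % lang) loop above assumes the spell_thai and spell_pali directories already contain spelling dictionaries. A hedged sketch of building one with the pre-2.x API (SpellChecker.add_words is assumed here; the word list is a placeholder):

import os
from whoosh.filedb.filestorage import FileStorage
from whoosh.spelling import SpellChecker

if not os.path.exists('spell_thai'):
    os.makedirs('spell_thai')
checker = SpellChecker(FileStorage('spell_thai'))
checker.add_words([u'word1', u'word2'])  # the real list would come from the corpus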
Example #6
class WhooshSearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias,
                                                  **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = connection_options.get('POST_LIMIT',
                                                 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[
            self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(),
                    field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception, e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" %
                               e.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
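
As a standalone reference, a sketch of the Whoosh primitives that setup() and build_schema() wire together (standard Whoosh API; this two-field schema and path are simplified stand-ins for what the backend actually builds):

from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestorage import FileStorage
from whoosh.qparser import QueryParser

schema = Schema(id=ID(stored=True, unique=True),
                text=TEXT(stored=True, analyzer=StemmingAnalyzer()))
storage = FileStorage('ix_dir')  # hypothetical; the directory must already exist
ix = storage.create_index(schema)
parser = QueryParser('text', schema=schema)
query = parser.parse(u'django AND search')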
Example #7
THAI_SCRIPT_DB = os.path.join(RESOURCES_DIR, 'thaict.sqlite')
ROMAN_SCRIPT_DB = os.path.join(RESOURCES_DIR, 'romanct.sqlite')
THAI_WATNA_DB = os.path.join(RESOURCES_DIR, 'thaiwn.sqlite')
THAI_POCKET_BOOK_DB = os.path.join(RESOURCES_DIR, 'thaipb-v9.sqlite')
PALI_MAHACHULA_DB = os.path.join(RESOURCES_DIR, 'palimc.sqlite')
THAI_SUPREME_DB = os.path.join(RESOURCES_DIR, 'thaims.sqlite')
THAI_VINAYA_DB = os.path.join(RESOURCES_DIR, 'thaivn.sqlite')

PALI_SIAM_DB = os.path.join(RESOURCES_DIR, 'pali.sqlite')
PALI_SIAM_NEW_DB = os.path.join(RESOURCES_DIR, 'palinew.sqlite')

PALI_DICT_DB = os.path.join(RESOURCES_DIR, 'p2t_dict.sqlite')
THAI_DICT_DB = os.path.join(RESOURCES_DIR, 'thaidict.sqlite')
ENGLISH_DICT_DB = os.path.join(RESOURCES_DIR, 'pali-english.sqlite')

THAI_SPELL_CHECKER = SpellChecker(FileStorage(os.path.join(RESOURCES_DIR, 'spell_thai')))
PALI_SPELL_CHECKER = SpellChecker(FileStorage(os.path.join(RESOURCES_DIR, 'spell_pali')))

BOOK_NAMES = cPickle.load(open(os.path.join(RESOURCES_DIR, 'book_name.pkl'), 'rb'))
BOOK_PAGES = cPickle.load(open(os.path.join(RESOURCES_DIR, 'book_page.pkl'),  'rb'))
BOOK_ITEMS = cPickle.load(open(os.path.join(RESOURCES_DIR, 'book_item.pkl'),  'rb'))
VOLUME_TABLE = cPickle.load(open(os.path.join(RESOURCES_DIR, 'maps.pkl'),  'rb'))
SCRIPT_ITEMS = json.loads(open(os.path.join(RESOURCES_DIR, 'ct_items.json')).read())

MAP_MC_TO_SIAM = cPickle.load(open(os.path.join(RESOURCES_DIR, 'mc_map.pkl'), 'rb'))
MAP_MS_TO_SIAM = cPickle.load(open(os.path.join(RESOURCES_DIR, 'ms_map.pkl'), 'rb'))

FIVE_BOOKS_TOC = json.loads(open(os.path.join(RESOURCES_DIR, 'bt_toc.json')).read())

ROMAN_SCRIPT_TOC = json.loads(open(os.path.join(RESOURCES_DIR, 'toc_rm.json')).read())
THAI_SCRIPT_TOC = json.loads(open(os.path.join(RESOURCES_DIR, 'toc_th.json')).read())
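
The module-level cPickle.load(open(...)) calls above leave their file handles for the garbage collector to close; the same load with explicit closing looks like this (sketched for one constant, same behavior otherwise):

import os
import cPickle

with open(os.path.join(RESOURCES_DIR, 'book_name.pkl'), 'rb') as f:
    BOOK_NAMES = cPickle.load(f)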
Example #8
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return			A list of HTMl string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {
        'content': 1.0,
        'title': 3.0
    }
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema, fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (build a new list; removing items from a
    # list while iterating over it would skip elements)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms, analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
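
A hedged usage sketch of run_query() (it assumes an index built with the module-level index_schema the function references; the path and query are illustrative):

from whoosh.filedb.filestorage import FileStorage

ix = FileStorage('ix_dir').open_index()  # hypothetical index directory
results, terms, suggestions, count = run_query(u'whoosh serach', ix)
for page in results:
    print('%s -> %s' % (page['title'], page['url']))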