def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        doc = index.full_prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        writer.update_document(**doc)

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()

        # If spelling support is desired, add to the dictionary.
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            sp = SpellChecker(self.storage)
            sp.add_field(self.index, self.content_field_name)
def create_spelling_suggestion(self, query_string):
    spelling_suggestion = None
    sp = SpellChecker(self.storage)
    cleaned_query = force_unicode(query_string)

    if not query_string:
        return spelling_suggestion

    # Clean the string.
    for rev_word in self.RESERVED_WORDS:
        cleaned_query = cleaned_query.replace(rev_word, '')

    for rev_char in self.RESERVED_CHARACTERS:
        cleaned_query = cleaned_query.replace(rev_char, '')

    # Break it down.
    query_words = cleaned_query.split()
    suggested_words = []

    for word in query_words:
        suggestions = sp.suggest(word, number=1)

        if len(suggestions) > 0:
            suggested_words.append(suggestions[0])

    spelling_suggestion = ' '.join(suggested_words)
    return spelling_suggestion
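For reference, a minimal standalone sketch of the legacy whoosh.spelling.SpellChecker workflow the methods above rely on (Whoosh 1.x; the index directory, field name, and sample words below are illustrative assumptions, not taken from the snippets):

# Sketch only: SpellChecker is the legacy Whoosh 1.x API used throughout this page;
# 'my_index_dir' and the 'content' field are hypothetical.
from whoosh.filedb.filestore import FileStorage
from whoosh.spelling import SpellChecker

storage = FileStorage('my_index_dir')   # storage that already holds an index
ix = storage.open_index()

sp = SpellChecker(storage)              # the spelling dictionary lives in the same storage
sp.add_field(ix, 'content')             # feed it the terms of the 'content' field

# Mirror create_spelling_suggestion(): take the best correction per word.
for word in u'serch qurey'.split():
    suggestions = sp.suggest(word, number=1)
    if suggestions:
        print(suggestions[0])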
def update(self, index, iterable, commit=True):
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = self.index.writer()

    for obj in iterable:
        doc = {}
        doc['id'] = force_unicode(self.get_identifier(obj))
        doc['django_ct'] = force_unicode("%s.%s" % (obj._meta.app_label, obj._meta.module_name))
        doc['django_id'] = force_unicode(obj.pk)
        other_data = index.prepare(obj)

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in other_data:
            other_data[key] = self._from_python(other_data[key])

        doc.update(other_data)
        writer.update_document(**doc)

    # For now, commit no matter what, as we run into locking issues otherwise.
    writer.commit()

    # If spelling support is desired, add to the dictionary.
    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
        sp = SpellChecker(self.storage)
        sp.add_field(self.index, self.content_field_name)
def update(self, documents, commit=True):
    writer = self.index.writer()

    for doc in documents:
        writer.update_document(**doc)

    if commit is True:
        writer.commit()

        # If spelling support is desired, add to the dictionary.
        if self.include_spelling is True:
            sp = SpellChecker(self.storage)
            sp.add_field(self.index, self.content_field_name)
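A hypothetical call to this simplified update(): 'backend', the field names, and the document values below are illustrative assumptions; the keys must match whatever schema self.index was built with.

# Hypothetical usage; 'backend' is assumed to be an instance whose setup has already
# created self.index, self.storage, self.content_field_name and self.include_spelling.
documents = [
    {'id': u'docs.page.1', 'content': u'whoosh spelling example', 'title': u'Example'},
    {'id': u'docs.page.2', 'content': u'another indexed document', 'title': u'Another'},
]
backend.update(documents, commit=True)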
def __init__(self):
    # Window title (Thai): "Program for searching and comparing the Buddha's words
    # from the Tipitaka (E-Tipitaka %s)"
    wx.Frame.__init__(self, parent=None, id=-1,
                      title=u'โปรแกรมตรวจหาและเทียบเคียงพุทธวจนจากพระไตรปิฎก (E-Tipitaka %s)' % (manifest.__version__),
                      size=(1000, 700))
    icon = wx.IconBundle()
    icon.AddIconFromFile(os.path.join('resources', 'e-tri_64_icon.ico'), wx.BITMAP_TYPE_ANY)
    self.SetIcons(icon)

    self.now = 0
    self.total = 0
    self.pages = 0
    self.per = 10
    self.lang = 'thai'
    self.mode = 'all'
    self.speller = {}
    self.checkedItems = range(45)
    self.read = []
    self.wildcard = u'E-Tipitaka Backup File (*.etz)|*.etz'

    # One Whoosh spell checker per language, each backed by its own file storage.
    for lang in ['thai', 'pali']:
        st = FileStorage(os.path.join('spell_%s' % (lang)))
        self.speller[lang] = SpellChecker(st)

    f = open(os.path.join('resources', 'book_name.pkl'), 'rb')
    self.bookNames = cPickle.load(f)
    f.close()

    self.resultWindow = ResultWindow(self.lang, self)

    self.font = self.LoadFont()
    if self.font is not None and self.font.IsOk():
        self.resultWindow.SetStandardFonts(self.font.GetPointSize(), self.font.GetFaceName())

    self.statusBar = self.CreateMyStatusBar()
    self.topSizer = self.CreateSearchBar()

    self.sizer = wx.BoxSizer(wx.VERTICAL)
    self.sizer.Add(self.topSizer, flag=wx.EXPAND)
    self.sizer.Add(self.resultWindow, 1, flag=wx.EXPAND)
    self.SetSizer(self.sizer)

    self.Bind(wx.EVT_CLOSE, self.OnClose)
class WhooshSearchBackend(BaseSearchBackend):
    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True,
                               extra={"data": {"index": index, "object": get_identifier(obj)}})

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
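For orientation, a hedged sketch of the Haystack connection settings a backend like this reads. Only PATH, STORAGE, and POST_LIMIT appear in the __init__ above; the other keys (ENGINE, INCLUDE_SPELLING) and the paths are assumptions to be checked against the Haystack version in use.

# Hypothetical Django settings sketch, not taken from the snippets above.
import os

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine',
        'PATH': os.path.join(os.path.dirname(__file__), 'whoosh_index'),
        'STORAGE': 'file',                 # anything else falls back to RAM storage
        'POST_LIMIT': 128 * 1024 * 1024,
        'INCLUDE_SPELLING': True,          # assumed name for enabling the spelling dictionary
    },
}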
THAI_SCRIPT_DB = os.path.join(RESOURCES_DIR, 'thaict.sqlite')
ROMAN_SCRIPT_DB = os.path.join(RESOURCES_DIR, 'romanct.sqlite')
THAI_WATNA_DB = os.path.join(RESOURCES_DIR, 'thaiwn.sqlite')
THAI_POCKET_BOOK_DB = os.path.join(RESOURCES_DIR, 'thaipb-v9.sqlite')
PALI_MAHACHULA_DB = os.path.join(RESOURCES_DIR, 'palimc.sqlite')
THAI_SUPREME_DB = os.path.join(RESOURCES_DIR, 'thaims.sqlite')
THAI_VINAYA_DB = os.path.join(RESOURCES_DIR, 'thaivn.sqlite')
PALI_SIAM_DB = os.path.join(RESOURCES_DIR, 'pali.sqlite')
PALI_SIAM_NEW_DB = os.path.join(RESOURCES_DIR, 'palinew.sqlite')
PALI_DICT_DB = os.path.join(RESOURCES_DIR, 'p2t_dict.sqlite')
THAI_DICT_DB = os.path.join(RESOURCES_DIR, 'thaidict.sqlite')
ENGLISH_DICT_DB = os.path.join(RESOURCES_DIR, 'pali-english.sqlite')

THAI_SPELL_CHECKER = SpellChecker(FileStorage(os.path.join(RESOURCES_DIR, 'spell_thai')))
PALI_SPELL_CHECKER = SpellChecker(FileStorage(os.path.join(RESOURCES_DIR, 'spell_pali')))

BOOK_NAMES = cPickle.load(open(os.path.join(RESOURCES_DIR, 'book_name.pkl'), 'rb'))
BOOK_PAGES = cPickle.load(open(os.path.join(RESOURCES_DIR, 'book_page.pkl'), 'rb'))
BOOK_ITEMS = cPickle.load(open(os.path.join(RESOURCES_DIR, 'book_item.pkl'), 'rb'))
VOLUME_TABLE = cPickle.load(open(os.path.join(RESOURCES_DIR, 'maps.pkl'), 'rb'))

SCRIPT_ITEMS = json.loads(open(os.path.join(RESOURCES_DIR, 'ct_items.json')).read())
MAP_MC_TO_SIAM = cPickle.load(open(os.path.join(RESOURCES_DIR, 'mc_map.pkl'), 'rb'))
MAP_MS_TO_SIAM = cPickle.load(open(os.path.join(RESOURCES_DIR, 'ms_map.pkl'), 'rb'))

FIVE_BOOKS_TOC = json.loads(open(os.path.join(RESOURCES_DIR, 'bt_toc.json')).read())
ROMAN_SCRIPT_TOC = json.loads(open(os.path.join(RESOURCES_DIR, 'toc_rm.json')).read())
THAI_SCRIPT_TOC = json.loads(open(os.path.join(RESOURCES_DIR, 'toc_th.json')).read())
def run_query(query, index):
    """
    Queries the index for data with the given text query

    @param query The text query to perform on the indexed data
    @return A list of HTML string snippets to return
    """
    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {
        'content': 1.0,
        'title': 3.0
    }
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema,
                                    fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" and "title" fields to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (filter into a new list rather than mutating
    # the list while iterating over it, which can skip elements)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up
    # excerpts by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    # highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms, analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
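A hedged usage sketch for run_query(). It relies on the module-level index_schema referenced inside the function; the index directory name and query string below are illustrative assumptions.

# Hypothetical driver; 'search_index' is an assumed directory containing an index
# built from the same module-level index_schema that run_query() uses.
from whoosh.index import open_dir

ix = open_dir('search_index')
results, terms, suggestions, count = run_query(u'exmaple query', ix)

for hit in results:
    print('%s -> %s' % (hit['title'], hit['url']))
print('%d results; suggestions: %r' % (count, suggestions))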