def typeahead_whoosh_index(query):
    """Print and return typeahead suggestions for each token of *query*.

    Tokenizes *query* with a SimpleAnalyzer and asks the index's
    "typeahead" corrector for up to 10 suggestions per token.

    :param query: unicode search string to complete.
    :returns: list of per-token suggestion lists (callers that ignored the
        previous ``None`` return are unaffected).
    """
    ix = get_whoosh_index()
    analyzer = analysis.SimpleAnalyzer()
    suggestions = []
    with ix.searcher() as searcher:
        corrector = searcher.corrector("typeahead")
        # FIX: the original analyzed the hard-coded literal
        # u'photographie de p' instead of the caller's query.
        for token in (t.text for t in analyzer(query)):
            suggestion_list = corrector.suggest(token, limit=10)
            # FIX: Python-2-only `print x` statement -> print() call
            # (works identically on py2 with a single argument).
            print(suggestion_list)
            suggestions.append(suggestion_list)
    return suggestions
def test_url():
    """URL-aware analyzers must keep whole URLs/URNs as single tokens."""
    sample = u("Visit http://bitbucket.org/mchaput/whoosh or " +
               "urn:isbn:5930502 or http://www.apple.com/.")
    expected = [u('visit'), u('http://bitbucket.org/mchaput/whoosh'),
                u('or'), u('urn:isbn:5930502'), u('or'),
                u('http://www.apple.com/')]
    analyzers = (analysis.SimpleAnalyzer(analysis.url_pattern),
                 analysis.StandardAnalyzer(analysis.url_pattern,
                                           stoplist=None))
    for analyzer in analyzers:
        tokens = [token.text for token in analyzer(sample)]
        assert tokens == expected
def test_nonexistant_fieldnames():
    # Need an analyzer that won't mangle a URL
    url_safe_analyzer = analysis.SimpleAnalyzer("\\S+")
    schema = fields.Schema(id=fields.ID,
                           text=fields.TEXT(analyzer=url_safe_analyzer))
    parser = default.QueryParser("text", schema)

    parsed = parser.parse(u("id:/code http://localhost/"))
    assert parsed.__class__ == query.And

    # First clause: the recognized `id:` field keeps its raw value.
    first = parsed[0]
    assert first.__class__ == query.Term
    assert first.fieldname == "id"
    assert first.text == "/code"

    # Second clause: the URL falls through to the default "text" field.
    second = parsed[1]
    assert second.__class__ == query.Term
    assert second.fieldname == "text"
    assert second.text == "http://localhost/"
def __init__(self, toolbox, index_help=True):
    """ Create a searcher for `toolbox`. """
    self.toolbox = toolbox
    self.rex = analysis.RegexTokenizer()
    # Tool names get a SimpleAnalyzer so punctuation-heavy names
    # still tokenize; other text fields use the schema defaults.
    self.schema = Schema(
        id=STORED,
        stub=KEYWORD,
        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
        description=TEXT,
        section=TEXT,
        help=TEXT,
        labels=KEYWORD,
    )
    self.build_index(index_help)
def __init__(self, toolbox, panel_view_id: str, index_dir: str, index_help: bool = True):
    """Create a searcher for `toolbox` backed by an on-disk index.

    `index_dir` must be recorded before `_index_setup()` runs, since the
    index is opened/created there.
    """
    self.schema = Schema(
        id=ID(stored=True, unique=True),
        old_id=ID,
        stub=KEYWORD,
        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
        description=TEXT,
        section=TEXT,
        help=TEXT,
        labels=KEYWORD,
    )
    self.rex = analysis.RegexTokenizer()
    self.toolbox = toolbox
    self.panel_view_id = panel_view_id
    self.index_dir = index_dir
    self.index = self._index_setup()
def test_span_near2():
    """A SpanNear query can itself be a clause of another SpanNear."""
    schema = fields.Schema(
        text=fields.TEXT(analyzer=analysis.SimpleAnalyzer(), stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(
        text=u("The Lucene library is by Doug Cutting and Whoosh " +
               "was made by Matt Chaput"))
    writer.commit()

    inner = spans.SpanNear(Term("text", "lucene"), Term("text", "doug"),
                           slop=5)
    outer = spans.SpanNear(inner, Term("text", "whoosh"), slop=4)

    with ix.searcher() as searcher:
        matcher = outer.matcher(searcher)
        assert matcher.spans() == [spans.Span(1, 8)]
def __init__(self, toolbox, index_help=True):
    """Create a searcher for `toolbox` and set up its storage/index."""
    self.toolbox = toolbox
    self.rex = analysis.RegexTokenizer()
    # Schema must exist before _index_setup() opens/creates the index.
    self.schema = Schema(
        id=STORED,
        stub=KEYWORD,
        name=TEXT(analyzer=analysis.SimpleAnalyzer()),
        description=TEXT,
        section=TEXT,
        help=TEXT,
        labels=KEYWORD,
    )
    self.storage, self.index = self._index_setup()
    # We keep track of how many times the tool index has been rebuilt.
    # We start at -1, so that after the first index the count is at 0,
    # which is the same as the toolbox reload count. This way we can skip
    # reindexing if the index count is equal to the toolbox reload count.
    self.index_count = -1
def test_phrase_order():
    """Phrase matches require the terms in order, not just co-occurring."""
    schema = fields.Schema(
        text=fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    # Index every ordering of the four words.
    for words in permutations(["ape", "bay", "can", "day"], 4):
        writer.add_document(text=u(" ").join(words))
    writer.commit()

    with ix.searcher() as searcher:
        def result(q):
            hits = searcher.search(q, limit=None, sortedby=None)
            return sorted(hit['text'] for hit in hits)

        q = Phrase("text", ["bay", "can", "day"])
        # Only the two permutations containing the contiguous, ordered
        # run "bay can day" should match.
        assert_equal(result(q), [u('ape bay can day'), u('bay can day ape')])
def get_index():
    """Return the module-level cached index, building it on first use."""
    global _ix
    if _ix is None:
        char_field = fields.FieldType(formats.Characters(),
                                      analysis.SimpleAnalyzer(),
                                      scorable=True, stored=True)
        _ix = RamStorage().create_index(fields.Schema(text=char_field))
        writer = _ix.writer()
        # One document per 4-word permutation of the shared `domain` words;
        # the raw tuple is kept as the stored value.
        for combo in permutations(domain, 4):
            writer.add_document(text=u(" ").join(combo), _stored_text=combo)
        writer.commit()
    return _ix
def test_pos_scorer():
    """FunctionWeighting scoring by first term position ranks earlier
    occurrences of "1" higher, across two segments (commit(merge=False))."""
    ana = analysis.SimpleAnalyzer()
    schema = fields.Schema(id=fields.STORED, key=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=0, key=u("0 0 1 0 0 0"))
    w.add_document(id=1, key=u("0 0 0 1 0 0"))
    w.add_document(id=2, key=u("0 1 0 0 0 0"))
    w.commit()
    # Second segment, kept separate so scoring spans multiple readers.
    w = ix.writer()
    w.add_document(id=3, key=u("0 0 0 0 0 1"))
    w.add_document(id=4, key=u("1 0 0 0 0 0"))
    w.add_document(id=5, key=u("0 0 0 0 1 0"))
    w.commit(merge=False)

    def pos_score_fn(searcher, fieldname, text, matcher):
        # Score is the reciprocal of (first position + 1): earlier is better.
        poses = matcher.value_as("positions")
        return 1.0 / (poses[0] + 1)

    pos_weighting = scoring.FunctionWeighting(pos_score_fn)
    # FIX: the original leaked the searcher; close it via a context manager.
    with ix.searcher(weighting=pos_weighting) as s:
        r = s.search(query.Term("key", "1"))
        assert_equal([hit["id"] for hit in r], [4, 2, 0, 1, 5, 3])
]

# Directory that will contain index
# NOTE(review): absolute, user-specific Windows paths — this script only
# runs on the original author's machine; consider making these configurable.
dir_index_cran = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\Index Cranfield'
dir_index_time = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\Index Time'

# For each analyzer (schema_type iterates the per-analyzer directory names)
for schema_type in dir_specific:
    # Prefix with a backslash so it can be appended to the base dirs below.
    schema_type = '\\' + schema_type
    print(schema_type)
    # In this case we'll boost the title field by 1.5
    # NOTE(review): comment says 1.5 but the code uses field_boost=2 —
    # confirm which value is intended.
    if schema_type == '\\Field Booster':
        selected_analyzer = analysis.SimpleAnalyzer()
        # Create a Schema
        schema = Schema(id=ID(stored=True),
                        title=TEXT(stored=False, analyzer=selected_analyzer,
                                   field_boost=2),
                        content=TEXT(stored=False,
                                     analyzer=selected_analyzer))
        # Create an empty-Index (fails if the directory already exists)
        os.mkdir(dir_index_cran + schema_type)
        temp_dir_cran = dir_index_cran + schema_type
        create_in(temp_dir_cran, schema)
        # --> in this case we won't create any schema for Time dataset since we won't be using 'title'
class WhooshBackend(Component):
    """
    Implements Whoosh SearchBackend interface
    """
    implements(ISearchBackend, ISystemInfoProvider)

    index_dir_setting = Option(
        BHSEARCH_CONFIG_SECTION,
        'whoosh_index_dir',
        default='whoosh_index',
        doc="""Relative path is resolved relatively to the directory of the environment.""",
        doc_domain='bhsearch')

    advanced_security = Option(
        BHSEARCH_CONFIG_SECTION,
        'advanced_security',
        default=False,
        doc="Check view permission for each document when retrieving results.",
        doc_domain='bhsearch')

    max_fragment_size = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'max_fragment_size',
        default=240,
        doc="The maximum number of characters allowed in a fragment.",
        doc_domain='bhsearch')

    fragment_surround = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'fragment_surround',
        default=60,
        doc="""The number of extra characters of context to add both before the first matched term and after the last matched term.""",
        doc_domain='bhsearch')

    #This is schema prototype. It will be changed later
    #TODO: add other fields support, add dynamic field support.
    #Schema must be driven by index participants
    SCHEMA = Schema(
        unique_id=ID(stored=True, unique=True),
        id=ID(stored=True),
        type=ID(stored=True),
        product=ID(stored=True),
        milestone=ID(stored=True),
        time=DATETIME(stored=True),
        due=DATETIME(stored=True),
        completed=DATETIME(stored=True),
        author=ID(stored=True),
        component=ID(stored=True),
        status=ID(stored=True),
        resolution=ID(stored=True),
        keywords=KEYWORD(scorable=True),
        summary=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        content=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        changes=TEXT(analyzer=analysis.StandardAnalyzer(stoplist=None)),
        owner=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        repository=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        revision=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        message=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        required_permission=ID(),
        name=TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()),
        query_suggestion_basket=TEXT(analyzer=analysis.SimpleAnalyzer(),
                                     spelling=True),
        relations=KEYWORD(lowercase=True, commas=True),
    )

    def __init__(self):
        # Resolve the configured index dir relative to the global
        # environment path when it is not absolute.
        self.index_dir = self.index_dir_setting
        if not os.path.isabs(self.index_dir):
            self.index_dir = os.path.join(
                get_global_env(self.env).path, self.index_dir)
        if index.exists_in(self.index_dir):
            self.index = index.open_dir(self.index_dir)
        else:
            # No index yet; recreate_index() must be called before use.
            self.index = None

    # ISystemInfoProvider methods
    def get_system_info(self):
        yield 'Whoosh', whoosh.versionstring()

    # ISearchBackend methods
    def start_operation(self):
        return self._create_writer()

    def _create_writer(self):
        return AsyncWriter(self.index)

    def add_doc(self, doc, operation_context=None):
        """Add any type of document index.

        The contents should be a dict with fields matching the search schema.
        The only required fields are type and id, everything else is optional.
        """
        # Reuse the caller's writer when given; otherwise commit/cancel a
        # local one ourselves.
        writer = operation_context
        is_local_writer = False
        if writer is None:
            is_local_writer = True
            writer = self._create_writer()

        self._reformat_doc(doc)
        doc[UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                doc["type"],
                                                doc["id"])
        self.log.debug("Doc to index: %s", doc)
        try:
            writer.update_document(**doc)
            if is_local_writer:
                writer.commit()
        except:
            if is_local_writer:
                writer.cancel()
            raise

    def _reformat_doc(self, doc):
        """
        Strings must be converted unicode format accepted by Whoosh.
        """
        # Drop None keys, None values and empty strings; convert the rest.
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        """Remove the document identified by (product, type, id)."""
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.log.debug('Removing document from the index: %s', unique_id)
        writer = operation_context
        is_local_writer = False
        if writer is None:
            is_local_writer = True
            writer = self._create_writer()
        try:
            writer.delete_by_term(UNIQUE_ID, unique_id)
            if is_local_writer:
                writer.commit()
        except:
            if is_local_writer:
                writer.cancel()
            raise

    def optimize(self):
        writer = AsyncWriter(self.index)
        writer.commit(optimize=True)

    def is_index_outdated(self):
        return self.index is None or not self.index.schema == self.SCHEMA

    def recreate_index(self):
        """Create a fresh index (discarding any existing one) and return it."""
        self.log.info('Creating Whoosh index in %s' % self.index_dir)
        self._make_dir_if_not_exists()
        self.index = index.create_in(self.index_dir, schema=self.SCHEMA)
        return self.index

    def query(self, query, query_string=None, sort=None, fields=None,
              filter=None, facets=None, pagenum=1, pagelen=20,
              highlight=False, highlight_fields=None, context=None):
        """Run *query* and return a populated QueryResult page."""
        # pylint: disable=too-many-locals
        with self.index.searcher() as searcher:
            self._apply_advanced_security(searcher, context)

            highlight_fields = self._prepare_highlight_fields(
                highlight, highlight_fields)

            sortedby = self._prepare_sortedby(sort)

            #TODO: investigate how faceting is applied to multi-value fields
            #e.g. keywords. For now, just pass facets lit to Whoosh API
            #groupedby = self._prepare_groupedby(facets)
            groupedby = facets

            query_parameters = dict(
                query=query,
                pagenum=pagenum,
                pagelen=pagelen,
                sortedby=sortedby,
                groupedby=groupedby,
                maptype=whoosh.sorting.Count,
                filter=filter,
            )
            self.env.log.debug("Whoosh query to execute: %s",
                               query_parameters)
            raw_page = searcher.search_page(**query_parameters)
            results = self._process_results(raw_page,
                                            fields,
                                            highlight_fields,
                                            query_parameters)
            if query_string is not None:
                c = searcher.correct_query(query, query_string)
                results.query_suggestion = c.string
            try:
                actual_query = unicode(query.simplify(searcher))
                results.debug['actual_query'] = actual_query
            # pylint: disable=bare-except
            except:
                # Simplify has a bug that causes it to fail sometimes.
                pass
            return results

    def _apply_advanced_security(self, searcher, context=None):
        """Wrap the searcher's collector so each hit is permission-checked."""
        if not self.advanced_security:
            return

        old_collector = searcher.collector
        security_processor = SecurityPreprocessor(self.env)

        def check_permission(doc):
            return security_processor.check_permission(doc, context)

        def collector(*args, **kwargs):
            c = old_collector(*args, **kwargs)
            if isinstance(c, FilterCollector):
                c = AdvancedFilterCollector(
                    c.child, c.allow, c.restrict, check_permission)
            else:
                c = AdvancedFilterCollector(
                    c, None, None, check_permission)
            return c
        searcher.collector = collector

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        """Convert datetime to naive utc datetime

        Whoosh can not read from index datetime values passed from Trac
        with tzinfo=trac.util.datefmt.FixedOffset because of non-empty
        constructor of FixedOffset"""
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _from_whoosh_format(self, value):
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_groupedby(self, facets):
        if not facets:
            return None
        groupedby = whoosh.sorting.Facets()
        for facet_name in facets:
            # FIX: was `whoosh.sortingwhoosh.Count`, which raises
            # AttributeError if this method is ever re-enabled.
            groupedby.add_field(facet_name, allow_overlap=True,
                                maptype=whoosh.sorting.Count)
        return groupedby

    def _prepare_sortedby(self, sort):
        """Translate sort instructions into Whoosh facet objects."""
        if not sort:
            return None
        sortedby = []
        for sort_instruction in sort:
            field = sort_instruction.field
            order = sort_instruction.order
            if field.lower() == SCORE:
                if self._is_desc(order):
                    #We can implement tis later by our own ScoreFacet with
                    # "score DESC" support
                    raise TracError(
                        "Whoosh does not support DESC score ordering.")
                sort_condition = whoosh.sorting.ScoreFacet()
            else:
                sort_condition = whoosh.sorting.FieldFacet(
                    field, reverse=self._is_desc(order))
            sortedby.append(sort_condition)
        return sortedby

    def _prepare_highlight_fields(self, highlight, highlight_fields):
        if not highlight:
            return ()
        if not highlight_fields:
            highlight_fields = self._all_highlightable_fields()
        return highlight_fields

    def _all_highlightable_fields(self):
        return [name for name, field in self.SCHEMA.items()
                if self._is_highlightable(field)]

    def _is_highlightable(self, field):
        # Only stored, non-datetime fields can be excerpted.
        return not isinstance(field, whoosh.fields.DATETIME) and field.stored

    def _is_desc(self, order):
        return (order.lower() == DESC)

    def _process_results(self, page, fields, highlight_fields,
                         search_parameters=None):
        # It's important to grab the hits first before slicing. Otherwise,
        # this can cause pagination failures.
        """
        :type fields: iterator
        :type page: ResultsPage
        """
        results = QueryResult()
        results.hits = page.total
        results.total_page_count = page.pagecount
        results.page_number = page.pagenum
        results.offset = page.offset
        results.facets = self._load_facets(page)

        docs = []
        highlighting = []
        for retrieved_record in page:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)

            result_highlights = self._create_highlights(highlight_fields,
                                                        retrieved_record)
            highlighting.append(result_highlights)
        results.docs = docs
        results.highlighting = highlighting
        results.debug["search_parameters"] = search_parameters
        return results

    def _process_record(self, fields, retrieved_record):
        result_doc = dict()
        #add score field by default
        if not fields or SCORE in fields:
            score = retrieved_record.score
            result_doc[SCORE] = score

        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)
        return result_doc

    def _load_facets(self, page):
        """This method can be also used by unit-tests"""
        non_paged_results = page.results
        facet_names = non_paged_results.facet_names()
        if not facet_names:
            return None
        facets_result = dict()
        for name in facet_names:
            facets_result[name] = non_paged_results.groups(name)
        return facets_result

    def _make_dir_if_not_exists(self):
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

        if not os.access(self.index_dir, os.W_OK):
            # FIX: the message previously used a backslash line-continuation
            # *inside* the string literal, embedding raw indentation into
            # the user-facing error text.
            raise TracError(
                "The path to Whoosh index '%s' is not writable for the "
                "current user." % self.index_dir)

    def _create_highlights(self, fields, record):
        """Return {field: highlighted_html_or_''} for each requested field."""
        result_highlights = dict()
        fragmenter = whoosh.highlight.ContextFragmenter(
            self.max_fragment_size,
            self.fragment_surround,
        )
        highlighter = whoosh.highlight.Highlighter(
            formatter=WhooshEmFormatter(),
            fragmenter=fragmenter)

        for field in fields:
            if field in record:
                highlighted = highlighter.highlight_hit(record, field)
            else:
                highlighted = ''
            result_highlights[field] = highlighted
        return result_highlights