def __init__(self, index_dir: str):
    ts = TurkishStemmer()
    self.__schema = fields.Schema(
        message=fields.TEXT(stored=True, field_boost=1.5,
                            analyzer=analysis.StemmingAnalyzer()
                            | analysis.NgramFilter(minsize=2, maxsize=5)),
        meta_content=fields.TEXT(
            stored=True,
            analyzer=analysis.StemmingAnalyzer()
            | analysis.NgramFilter(minsize=2, maxsize=5)),
        message_id=fields.NUMERIC(stored=True, bits=64),
        chat_id=fields.NUMERIC(stored=True, bits=64),
        message_tr=fields.TEXT(
            stored=False, field_boost=1.5,
            analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                               stoplist=STOP_WORDS_TR)
            | analysis.NgramFilter(minsize=2, maxsize=5)),
        meta_content_tr=fields.TEXT(
            stored=False,
            analyzer=analysis.StemmingAnalyzer(stemfn=ts.stem,
                                               stoplist=STOP_WORDS_TR)
            | analysis.NgramFilter(minsize=2, maxsize=5)),
    )
    if not os.path.isdir(index_dir):
        os.mkdir(index_dir)
        self.__index = index.create_in(index_dir, self.__schema)
    else:
        self.__index = index.open_dir(index_dir)

def test_multi_language():
    # Analyzer for English
    ana_eng = analysis.StemmingAnalyzer()

    # Analyzer for Pig Latin
    def stem_piglatin(w):
        if w.endswith("ay"):
            w = w[:-2]
        return w
    ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"],
                                        stemfn=stem_piglatin)

    # Dictionary mapping languages to analyzers
    analyzers = {"eng": ana_eng, "pig": ana_pig}

    # Fake documents
    corpus = [(u("eng"), u("Such stuff as dreams are made on")),
              (u("pig"), u("Otay ebay, roay otnay otay ebay"))]

    schema = fields.Schema(content=fields.TEXT(stored=True),
                           lang=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        for doclang, content in corpus:
            ana = analyzers[doclang]
            # "Pre-analyze" the field into token strings
            words = [token.text for token in ana(content)]
            # Note we store the original value but index the pre-analyzed words
            w.add_document(lang=doclang, content=words,
                           _stored_content=content)

    with ix.searcher() as s:
        schema = s.schema

        # Modify the schema to fake the correct analyzer for the language
        # we're searching in
        schema["content"].analyzer = analyzers["eng"]

        qp = qparser.QueryParser("content", schema)
        q = qp.parse("dreaming")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Such stuff as dreams are made on"

        schema["content"].analyzer = analyzers["pig"]
        qp = qparser.QueryParser("content", schema)
        q = qp.parse("otnay")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Otay ebay, roay otnay otay ebay"

def test_add_reader_spelling():
    # Test whether add_spell_word() items get copied over in a merge

    # Because b is stemming and spelled, it will use add_spell_word()
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(spelling=True),
                           b=fields.TEXT(analyzer=ana, spelling=True))

    with TempIndex(schema, "addreadersp") as ix:
        with ix.writer() as w:
            w.add_document(a=u("rendering modeling compositing enabling"),
                           b=u("rendering modeling compositing enabling"))
            w.add_document(a=u("flying rolling tying quitting polling"),
                           b=u("flying rolling tying quitting polling"))

        with ix.writer() as w:
            w.add_document(a=u("writing eyeing ripping timing yelling"),
                           b=u("writing eyeing ripping timing yelling"))
            w.add_document(a=u("undoing indicating opening pressing"),
                           b=u("undoing indicating opening pressing"))

        with ix.searcher() as s:
            gr = s.reader().word_graph("a")
            assert " ".join(gr.flatten_strings()) == (
                "compositing enabling eyeing flying indicating "
                "modeling opening polling pressing quitting "
                "rendering ripping rolling timing tying undoing "
                "writing yelling")

            gr = s.reader().word_graph("b")
            assert " ".join(gr.flatten_strings()) == (
                "compositing enabling eyeing flying indicating "
                "modeling opening polling pressing quitting "
                "rendering ripping rolling timing tying undoing "
                "writing yelling")

def test_analyzing_terms(self):
    schema = fields.Schema(text=fields.TEXT(
        analyzer=analysis.StemmingAnalyzer()))
    qp = qparser.QueryParser("text", schema=schema)
    q = qp.parse(u"Indexed!")
    self.assertEqual(q.__class__.__name__, "Term")
    self.assertEqual(q.text, "index")

def test_analyzing_terms():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana))
    qp = default.QueryParser("text", schema)
    q = qp.parse(u("Indexed!"))
    assert q.__class__ == query.Term
    assert q.text == "index"

def test_add_reader_spelling():
    # Test whether add_spell_word() items get copied over in a merge

    # Because b is stemming and spelled, it will use add_spell_word()
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(analyzer=ana),
                           b=fields.TEXT(analyzer=ana, spelling=True))

    with TempIndex(schema, "addreadersp") as ix:
        with ix.writer() as w:
            w.add_document(a=u"rendering modeling", b=u"rendering modeling")
            w.add_document(a=u"flying rolling", b=u"flying rolling")

        with ix.writer() as w:
            w.add_document(a=u"writing eyeing", b=u"writing eyeing")
            w.add_document(a=u"undoing indicating", b=u"undoing indicating")
            w.optimize = True

        with ix.reader() as r:
            sws = list(r.lexicon("spell_b"))
            assert sws == [b"eyeing", b"flying", b"indicating", b"modeling",
                           b"rendering", b"rolling", b"undoing", b"writing"]

            assert list(r.terms_within("a", "undoink", 1)) == []
            assert list(r.terms_within("b", "undoink", 1)) == ["undoing"]

def test_spelling_field():
    text = u"rendering shading modeling reactions"

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    assert schema["text"].spelling
    assert schema["text"].separate_spelling()

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=text)

        with ix.searcher() as s:
            r = s.reader()
            fieldobj = schema["text"]
            words = [fieldobj.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [fieldobj.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            # suggest() automatically looks in the spell_text field because
            # it calls fieldobj.spelling_fieldname() first
            assert s.suggest("text", "renderink") == ["rendering"]

        with ix.writer() as w:
            w.delete_document(0)

def add_key_terms(ix):
    s = ix.searcher()
    w = ix.writer()
    stemmer = analysis.StemmingAnalyzer()
    print("Adding key terms...")
    last_book = None
    for doc_num in s.document_numbers():
        fields = s.stored_fields(doc_num)
        if fields['book_name'] != last_book:
            last_book = fields['book_name']
            print(last_book)
        m = re.search(r'session (\d+)', fields['session'], flags=re.IGNORECASE)
        is_session_num = lambda k: re.match(
            r'{0}(st|nd|rd|th)?'.format(m.group(1)), k) if m else False
        key_terms = [k for k, v
                     in s.key_terms([doc_num], 'key_terms_content', numterms=10)
                     if not is_session_num(k)]
        stemmed = [t.text for t in stemmer(' '.join(key_terms))]
        final_terms = []
        final_stemmed = set()
        for (term, stemmed_term) in zip(key_terms, stemmed):
            if stemmed_term not in final_stemmed:
                final_terms.append(term)
                final_stemmed.add(stemmed_term)
        fields['key_terms'] = final_terms
        fields['stemmed'] = fields['key_terms_content']
        fields['exact'] = fields['key_terms_content']
        fields['common'] = fields['key_terms_content']
        del fields['key_terms_content']
        w.delete_document(doc_num)
        w.add_document(**fields)
    w.commit()

def test_issue324():
    sa = analysis.StemmingAnalyzer()
    result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa,
                                 fragmenter=highlight.ContextFragmenter(),
                                 formatter=highlight.UppercaseFormatter())
    assert result == "INDEXED!\n1"

def test_pickle_schema():
    from whoosh import analysis
    from whoosh.support.charset import accent_map
    from whoosh.compat import dumps

    freetext_analyzer = (analysis.StemmingAnalyzer()
                         | analysis.CharsetFilter(accent_map))

    schema = fields.Schema(
        path=fields.ID(stored=True, unique=True),
        file_mtime=fields.DATETIME(stored=True),
        name=fields.TEXT(stored=False, field_boost=2.0),
        description=fields.TEXT(stored=False, field_boost=1.5,
                                analyzer=freetext_analyzer),
        content=fields.TEXT(analyzer=freetext_analyzer))

    # Try to make some sentences that will require stemming
    docs = [
        u"The rain in spain falls mainly in the plain",
        u"Plainly sitting on the plain",
        u"Imagine a greatly improved sentence here"
    ]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc in docs:
                w.add_document(description=doc, content=doc)

        assert dumps(schema, 2)

        with ix.reader() as r:
            assert dumps(r.schema, 2)

def test_correct_spell_field():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")

        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]

            r = s.reader()
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            q = qp.parse(qtext, s.schema)

            r = s.search(q)
            assert len(r) == 0

            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">rendering</strong>'

def test_memory_codec():
    from whoosh.codec import memory
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"), b=5.5, c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"), b=True, c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"), b=None, c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(s.field_terms("a"))
            == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    c_values = list(reader.column_reader("c"))
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"

def __init__(self, analyzer=NOT_PROVIDED, **kwargs):
    if kwargs.get("facet_class") is None:
        kwargs["facet_class"] = FacetCharField

    # Use StemmingAnalyzer by default
    kwargs["analyzer"] = (
        analysis.StemmingAnalyzer() if analyzer is NOT_PROVIDED else analyzer
    )

    super().__init__(**kwargs)

def test_bypass_stemming2():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(
            content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00"))
        w.add_document(content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study"))
        w.add_document(content=u("This is the first document we've added!"))

def create_whoosh_index(cls):
    indexdir = os.path.sep.join([config['datadir'], cls.module_dir, 'index'])
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)

    print "Creating a new index"
    ana = analysis.StemmingAnalyzer()
    schema = Schema(title=TEXT(stored=True),
                    basefile=ID(stored=True, unique=True),
                    content=TEXT)
    # FIXME: Get a keyword list, correct title, and list of treaty
    # references (celex nums as keywords or uris or...)
    whoosh_ix = create_in(indexdir, schema)
    base_dir = config['datadir']

    from time import time
    for basefile in cls.get_iterable_for("relate_all", base_dir):
        if not ("J" in basefile or "A" in basefile or "K" in basefile):
            continue
        readstart = time()
        # just save the text from the document, strip out the tags
        from BeautifulSoup import BeautifulSoup
        m = cls.re_celexno.match(basefile)
        year = m.group(2)
        parsed_file = os.path.sep.join([base_dir, cls.module_dir, u'parsed',
                                        year, basefile + '.xhtml'])
        soup = BeautifulSoup(open(parsed_file).read())
        text = ''.join(soup.findAll(text=True))
        # Skip the first 150 chars (XML junk) and normalize space
        text = ' '.join(text[150:].split())
        if text:
            indexstart = time()
            writer = whoosh_ix.writer()
            writer.update_document(title="Case " + basefile,
                                   basefile=basefile,
                                   content=text)
            writer.commit()
            print "Added %s '%s...' %.1f kb in %.3f + %.3f s" % (
                basefile, text[:39], len(text) / 1024,
                indexstart - readstart, time() - indexstart)
        else:
            print "Noadd %s (no text)" % (basefile)

    searcher = whoosh_ix.searcher()
    results = searcher.find("content",
                            "quantitative imports equivalent prohibited",
                            limit=10)
    for i in range(len(results)):
        print "%s: %s" % (results[i]['title'], results.score(i))

def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))

    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(
        title=fields.TEXT(analyzer=ana),
        author=fields.TEXT(phrase=False),
        subject=fields.TEXT(analyzer=ana, phrase=False),
        file=fields.STORED,
        pos=fields.STORED,
    )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor"
          % (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [filename for filename in os.listdir(basedir)
                     if fnmatch.fnmatch(filename, glob)]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)), author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename, pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes"
          % (c, (now() - t) / 60.0))

def whoosh_schema(self):
    ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
    storebody = self.options.storebody
    schema = fields.Schema(body=fields.TEXT(analyzer=ana, stored=storebody),
                           filepos=fields.STORED,
                           date=fields.ID(stored=True),
                           frm=fields.ID(stored=True),
                           to=fields.IDLIST(stored=True),
                           subject=fields.TEXT(stored=True),
                           cc=fields.IDLIST,
                           bcc=fields.IDLIST)
    return schema

def test_spellable_list():
    # Make sure a spellable field works with a list of pre-analyzed tokens
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(Location=fields.STORED, Lang=fields.STORED,
                           Title=fields.TEXT(spelling=True, analyzer=ana))
    ix = RamStorage().create_index(schema)

    doc = {'Location': '1000/123', 'Lang': 'E',
           'Title': ['Introduction', 'Numerical', 'Analysis']}
    with ix.writer() as w:
        w.add_document(**doc)

def test_bypass_stemming():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(text=u("rendering shading modeling reactions"))
    w.commit()

    with ix.reader() as r:
        assert_equal(list(r.lexicon("text")),
                     ["model", "reaction", "render", "shade"])
        assert_equal(list(r.word_graph("text").flatten_strings()),
                     ["modeling", "reactions", "rendering", "shading"])

def populate_whoosh(text_dir, whoosh_dir):
    loaded = 0

    # Create analyzers used for tokenizing and normalizing tokens
    # 000, 001, 010, 011
    my_analyzers = [(analysis.RegexTokenizer()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()),
                    (analysis.RegexTokenizer() | analysis.LowercaseFilter()
                     | analysis.StopFilter()),
                    (analysis.StemmingAnalyzer())]

    # Create schemas
    schemas = []
    for my_analyzer in my_analyzers:
        schema = Schema(url=ID(stored=True),
                        body=TEXT(stored=True, analyzer=my_analyzer))
        schemas.append(schema)

    # Setup index
    ixs = []
    for i, my_analyzer in enumerate(my_analyzers):
        whoosh_dir_current = whoosh_dir + str(i) + '/'
        os.makedirs(whoosh_dir_current, exist_ok=True)
        ix = index.create_in(whoosh_dir_current, schemas[i])
        ixs.append(ix)

    # Clear index
    writers = []
    for i, my_analyzer in enumerate(my_analyzers):
        writer = ixs[i].writer()
        writer.commit(mergetype=writing.CLEAR)
        writer = ixs[i].writer()
        writers.append(writer)

    # Index documents
    for root, dirs, files in os.walk(text_dir, topdown=False):
        for name in files:
            text_file = os.path.join(root, name)
            print('.', end='')
            with open(text_file) as tf:
                body = tf.read()
                url = text_file.replace(text_dir, "")
                for writer in writers:
                    writer.add_document(url=url, body=body)
                    # print("Added", url)
                loaded += 1

    for writer in writers:
        writer.commit()

    print("\n\nLoaded", loaded, "documents")

def get_index(self):
    stem_ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(
        id=fields.ID(unique=True),
        datetime=fields.DATETIME(sortable=True),
        reply=fields.BOOLEAN,
        retweet=fields.BOOLEAN,
        text=fields.TEXT(analyzer=stem_ana, stored=True)
    )

    index_dir = os.path.join(self.dir, "index")
    if os.path.exists(index_dir):
        self.index = index.open_dir(index_dir)
    else:
        os.mkdir(index_dir)
        self.index = index.create_in(index_dir, schema)

def open_index():
    from whoosh import index, fields as f
    if os.path.isdir(app.config['WHOOSH_INDEX']):
        return index.open_dir(app.config['WHOOSH_INDEX'])
    os.mkdir(app.config['WHOOSH_INDEX'])
    analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map)
    schema = f.Schema(url=f.ID(stored=True, unique=True),
                      id=f.ID(stored=True),
                      title=f.TEXT(stored=True, field_boost=2.0,
                                   analyzer=analyzer),
                      type=f.ID(stored=True),
                      keywords=f.KEYWORD(commas=True),
                      content=f.TEXT(analyzer=analyzer))
    return index.create_in(app.config['WHOOSH_INDEX'], schema)

def test_spelling_field_order():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT, b=fields.TEXT(analyzer=ana),
                           c=fields.TEXT, d=fields.TEXT(analyzer=ana),
                           e=fields.TEXT(analyzer=ana), f=fields.TEXT)
    ix = RamStorage().create_index(schema)

    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for ls in permutations(domain):
        value = " ".join(ls)
        w.add_document(a=value, b=value, c=value, d=value, e=value, f=value)
    w.commit()

def test_very_long_words():
    import sys
    length = int(sys.getrecursionlimit() * 1.5)

    strings1 = [u(chr(i) * length) for i in range(65, 70)]
    strings2 = [u(chr(i) * length) for i in range(71, 75)]

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for string in strings1:
            w.add_document(text=string)

    with ix.writer() as w:
        for string in strings2:
            w.add_document(text=string)
        w.optimize = True

def test_multivalue():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(id=fields.STORED, date=fields.DATETIME,
                           num=fields.NUMERIC,
                           txt=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=1, date=datetime(2001, 1, 1), num=5)
        w.add_document(id=2, date=[datetime(2002, 2, 2), datetime(2003, 3, 3)],
                       num=[1, 2, 3, 12])
        w.add_document(txt=u("a b c").split())

    with ix.reader() as r:
        assert ("num", 3) in r
        assert ("date", datetime(2003, 3, 3)) in r
        assert " ".join(r.field_terms("txt")) == "a b c"

def test_missing_suggestion():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True),
                           organism=fields.ID)
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        w.add_document(organism=u("hs"), content=u("cells"))
        w.add_document(organism=u("hs"), content=u("cell"))

    with ix.searcher() as s:
        r = s.reader()
        assert r.has_word_graph("content")
        gr = r.word_graph("content")
        assert list(gr.flatten()) == [b("cell"), b("cells")]

        c = s.corrector("content")
        # Note that the corrector won't suggest the word you submit even
        # though it's in the index
        assert c.suggest("cell") == ["cells"]

def test_snippets():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(stored=True, analyzer=ana))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u(
        "Lay out the rough animation by creating the important poses where they occur on the timeline."))
    w.add_document(text=u(
        "Set key frames on everything that's key-able. This is for control and predictability: you don't want to accidentally leave something un-keyed. This is also much faster than selecting the parameters to key."))
    w.add_document(text=u(
        "Use constant (straight) or sometimes linear transitions between keyframes in the channel editor. This makes the character jump between poses."))
    w.add_document(text=u(
        "Keying everything gives quick, immediate results. But it can become difficult to tweak the animation later, especially for complex characters."))
    w.add_document(text=u(
        "Copy the current pose to create the next one: pose the character, key everything, then copy the keyframe in the playbar to another frame, and key everything at that frame."))
    w.commit()

    target = [
        "Set KEY frames on everything that's KEY-able",
        "Copy the current pose to create the next one: pose the character, KEY everything, then copy the keyframe in the playbar to another frame, and KEY everything at that frame",
        "KEYING everything gives quick, immediate results"
    ]

    with ix.searcher() as s:
        qp = qparser.QueryParser("text", ix.schema)
        q = qp.parse(u("key"))
        r = s.search(q, terms=True)
        r.fragmenter = highlight.SentenceFragmenter()
        r.formatter = highlight.UppercaseFormatter()

        assert (sorted([hit.highlights("text", top=1) for hit in r])
                == sorted(target))

def _process_results(self, raw_page, highlight=False, query_string='',
                     spelling_query=None):
    from haystack import site
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result['django_ct'].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if isinstance(index.fields[string_key], MultiValueField):
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del(additional_fields['django_ct'])
            del(additional_fields['django_id'])

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]

                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                }

            result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }

def test_analyzing_terms():
    schema = fields.Schema(text=fields.TEXT(analyzer=analysis.StemmingAnalyzer()))
    qp = default.QueryParser("text", schema)
    q = qp.parse(u("Indexed!"))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.text, "index")

def _process_results(self, raw_page, highlight=False, query_string='',
                     spelling_query=None, result_class=None):
    from haystack import connections
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del(additional_fields[DJANGO_CT])
            del(additional_fields[DJANGO_ID])

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]

                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                }

            result = result_class(app_label, model_name, raw_result[DJANGO_ID],
                                  score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }