def test_double_metaphone():
    from whoosh.lang.dmetaphone import double_metaphone

    names = {'maurice': ('MRS', None),
             'aubrey': ('APR', None),
             'cambrillo': ('KMPRL', 'KMPR'),
             'heidi': ('HT', None),
             'katherine': ('K0RN', 'KTRN'),
             'Thumbail': ('0MPL', 'TMPL'),
             'catherine': ('K0RN', 'KTRN'),
             'richard': ('RXRT', 'RKRT'),
             'bob': ('PP', None),
             'eric': ('ARK', None),
             'geoff': ('JF', 'KF'),
             'Through': ('0R', 'TR'),
             'Schwein': ('XN', 'XFN'),
             'dave': ('TF', None),
             'ray': ('R', None),
             'steven': ('STFN', None),
             'bryce': ('PRS', None),
             'randy': ('RNT', None),
             'bryan': ('PRN', None),
             'Rapelje': ('RPL', None),
             'brian': ('PRN', None),
             'otto': ('AT', None),
             'auto': ('AT', None),
             'Dallas': ('TLS', None),
             'maisey': ('MS', None),
             'zhang': ('JNK', None),
             'Chile': ('XL', None),
             'Jose': ('HS', None),
             'Arnow': ('ARN', 'ARNF'),
             'solilijs': ('SLLS', None),
             'Parachute': ('PRKT', None),
             'Nowhere': ('NR', None),
             'Tux': ('TKS', None)}

    for name in names:
        dmn = double_metaphone(name)
        assert dmn == names[name]

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter())
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)]

    mf = (analysis.RegexTokenizer()
          | analysis.LowercaseFilter()
          | analysis.DoubleMetaphoneFilter(combine=True))
    results = [(t.text, t.boost) for t in mf(u("Spruce View"))]
    assert results == [('spruce', 1.0), ('SPRS', 1.0),
                       ('view', 1.0), ('F', 1.0), ('FF', 0.5)]

    namefield = fields.TEXT(analyzer=mf)
    texts = list(namefield.process_text(u("Spruce View"), mode="query"))
    assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF']
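# The Organization schema below references an `ngram_analyzer` that is not
# defined in this snippet. A minimal sketch of one possible definition,
# assuming whoosh's built-in NgramWordAnalyzer is acceptable for substring
# matching on the legal-name field; the ngram sizes are illustrative, not
# taken from the original source.
from whoosh.analysis import NgramWordAnalyzer

ngram_analyzer = NgramWordAnalyzer(minsize=2, maxsize=4)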
class Organization(fields.SchemaClass):
    # numero_de_da : activity declaration number (Numéro de la Déclaration d'Activité)
    numero_de_da = fields.ID(stored=True, unique=True)
    # form_total : number of trainers
    form_total = fields.NUMERIC(stored=True)
    # da_siren : SIREN number of the organization
    da_siren = fields.ID(stored=True, unique=True)
    # da_no_etab : establishment number of the organization
    da_no_etab = fields.ID(stored=True)
    # da_raison_sociale : legal name (raison sociale)
    da_raison_sociale = fields.TEXT(stored=True, analyzer=ngram_analyzer,
                                    phrase=False)
    # adr_rue_physique : street of the physical address
    adr_rue_physique = fields.TEXT(stored=True)
    # adr_rue_complement_physique : additional line of the physical address
    adr_rue_complement_physique = fields.TEXT(stored=True)
    # adr_code_postal_physique : postal code of the physical address
    adr_code_postal_physique = fields.ID(stored=True)
    # adr_ville_physique : city of the physical address
    adr_ville_physique = fields.TEXT(stored=True)
    # adr_rue_postale : street of the mailing address
    adr_rue_postale = fields.TEXT(stored=True)
    # adr_rue_complement_postale : additional line of the mailing address
    adr_rue_complement_postale = fields.TEXT(stored=True)
    # adr_code_postal_postale : postal code of the mailing address
    adr_code_postal_postale = fields.ID(stored=True)
    # adr_ville_postale : city of the mailing address
    adr_ville_postale = fields.TEXT(stored=True)
def test_sort_text_field():
    domain = (("Visual Display of Quantitative Information, The", 10),
              ("Envisioning Information", 10),
              ("Visual Explanations", 10),
              ("Beautiful Evidence", -10),
              ("Visual and Statistical Thinking", -10),
              ("Cognitive Style of Powerpoint", -10))
    sorted_titles = sorted(d[0] for d in domain)

    schema = fields.Schema(title=fields.TEXT(stored=True, sortable=True),
                           num=fields.NUMERIC(sortable=True))

    def test(ix):
        with ix.searcher() as s:
            # Sort by title
            r = s.search(query.Every(), sortedby="title")
            assert [hit["title"] for hit in r] == sorted_titles

            # Sort by reverse title
            facet = sorting.FieldFacet("title", reverse=True)
            r = s.search(query.Every(), sortedby=facet)
            assert [hit["title"] for hit in r] == list(reversed(sorted_titles))

            # Sort by num (-10 to 10) first, and within that, by reverse title
            facet = sorting.MultiFacet()
            facet.add_field("num")
            facet.add_field("title", reverse=True)
            r = s.search(query.Every(), sortedby=facet)
            target = ["Visual and Statistical Thinking",
                      "Cognitive Style of Powerpoint",
                      "Beautiful Evidence",
                      "Visual Explanations",
                      "Visual Display of Quantitative Information, The",
                      "Envisioning Information",
                      ]
            assert [hit["title"] for hit in r] == target

    # Single segment
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for title, num in domain:
            w.add_document(title=u(title), num=num)
    test(ix)

    # Multisegment
    ix = RamStorage().create_index(schema)
    # Segment 1
    with ix.writer() as w:
        for title, num in domain[:3]:
            w.add_document(title=u(title), num=num)
    # Segment 2
    with ix.writer() as w:
        for title, num in domain[3:]:
            w.add_document(title=u(title), num=num)
        w.merge = False
    test(ix)
class WikiSchema(fields.SchemaClass):
    '''This describes the content that will be stored in the search index.'''
    # The field boost helps wiki page paths show more prevalently in results
    # since they will also be used as links in the content of other pages.
    path = fields.ID(unique=True, field_boost=2.0, stored=True)
    # The content is stored so that highlights can be extracted to display.
    content = fields.TEXT(stored=True)
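# A minimal sketch of how WikiSchema might be used, assuming an on-disk
# index directory named "wiki_index"; the directory name, document, and
# query string are illustrative, not part of the original source.
import os

from whoosh import index
from whoosh.qparser import QueryParser


def _wiki_schema_example():
    if not os.path.exists("wiki_index"):
        os.mkdir("wiki_index")
    # create_in accepts the SchemaClass itself and instantiates it
    ix = index.create_in("wiki_index", WikiSchema)

    with ix.writer() as w:
        # Because path has field_boost=2.0, a hit on the path contributes
        # more to the score than the same term appearing only in content.
        w.add_document(path="/FrontPage",
                       content="Welcome to the wiki FrontPage")

    with ix.searcher() as s:
        qp = QueryParser("content", ix.schema)
        return [hit["path"] for hit in s.search(qp.parse("FrontPage"))]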
def test_no_add():
    check_multi()
    from whoosh.multiproc import MpWriter

    schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True,
                                            vector=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=3) as w:
            assert type(w) == MpWriter
def _mk_schema(self, dsinfo):
    from whoosh import fields as wf
    from whoosh.analysis import SimpleAnalyzer

    # haven for terms that have been found to be undefined
    # (for faster decision-making upon next encounter)
    # this will harvest all discovered term definitions
    definitions = {
        '@id': 'unique identifier of an entity',
        # TODO make proper JSON-LD definition
        'path': 'path name of an entity relative to the searched base dataset',
        # TODO make proper JSON-LD definition
        'parentds': 'path of the datasets that contains an entity',
        # 'type' will not come from a metadata field, hence will not be detected
        'type': 'type of a record',
    }

    schema_fields = {
        n.lstrip('@'): wf.ID(stored=True, unique=(n == '@id'))
        for n in definitions
    }

    lgr.debug('Scanning for metadata keys')
    # quick 1st pass over all datasets to gather the needed schema fields
    log_progress(
        lgr.info,
        'idxschemabuild',
        'Start building search schema',
        total=len(dsinfo),
        label='Building search schema',
        unit=' Datasets',
    )
    for res in query_aggregated_metadata(
            # XXX TODO After #2156 datasets may not necessarily carry all
            # keys in the "unique" summary
            reporton='datasets',
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            recursive=True):
        meta = res.get('metadata', {})
        # no stringification of values for speed, we do not need/use the
        # actual values at this point, only the keys
        idxd = _meta2autofield_dict(meta, val2str=False)

        for k in idxd:
            schema_fields[k] = wf.TEXT(stored=False,
                                       analyzer=SimpleAnalyzer())
        log_progress(lgr.info, 'idxschemabuild',
                     'Scanned dataset at %s', res['path'],
                     update=1, increment=True)
    log_progress(lgr.info, 'idxschemabuild', 'Done building search schema')

    self.schema = wf.Schema(**schema_fields)
def test_free_dates():
    a = analysis.StandardAnalyzer(stoplist=None)
    schema = fields.Schema(text=fields.TEXT(analyzer=a),
                           date=fields.DATETIME)
    qp = qparser.QueryParser("text", schema)
    basedate = datetime(2010, 9, 20, 15, 16, 6, 454000)
    qp.add_plugin(dateparse.DateParserPlugin(basedate, free=True))

    q = qp.parse(u("hello date:last tuesday"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.Term)
    assert_equal(q[0].text, "hello")
    assert_equal(q[1].__class__, query.DateRange)
    assert_equal(q[1].startdate, adatetime(2010, 9, 14).floor())
    assert_equal(q[1].enddate, adatetime(2010, 9, 14).ceil())

    q = qp.parse(u("date:mar 29 1972 hello"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.DateRange)
    assert_equal(q[0].startdate, adatetime(1972, 3, 29).floor())
    assert_equal(q[0].enddate, adatetime(1972, 3, 29).ceil())
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[1].text, "hello")

    q = qp.parse(u("date:2005 march 2"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2005, 3, 2).floor())
    assert_equal(q.enddate, adatetime(2005, 3, 2).ceil())

    q = qp.parse(u("date:'2005' march 2"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0].__class__, query.DateRange)
    assert_equal(q[0].startdate, adatetime(2005).floor())
    assert_equal(q[0].enddate, adatetime(2005).ceil())
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[1].fieldname, "text")
    assert_equal(q[1].text, "march")

    q = qp.parse(u("date:march 24 to dec 12"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2010, 3, 24).floor())
    assert_equal(q.enddate, adatetime(2010, 12, 12).ceil())

    q = qp.parse(u("date:5:10pm"))
    assert_equal(q.__class__, query.DateRange)
    assert_equal(q.startdate, adatetime(2010, 9, 20, 17, 10).floor())
    assert_equal(q.enddate, adatetime(2010, 9, 20, 17, 10).ceil())

    q = qp.parse(u("(date:30 june OR date:10 july) quick"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.Or)
    assert_equal(q[0][0].__class__, query.DateRange)
    assert_equal(q[0][1].__class__, query.DateRange)
def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))

    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           author=fields.TEXT(phrase=False),
                           subject=fields.TEXT(analyzer=ana, phrase=False),
                           file=fields.STORED,
                           pos=fields.STORED,
                           )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor"
          % (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [filename for filename in os.listdir(basedir)
                     if fnmatch.fnmatch(filename, glob)]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)), author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename, pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
def test_bypass_stemming2():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(
            content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00"))
        w.add_document(content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study"))
        w.add_document(content=u("This is the first document we've added!"))
def test_token_boost():
    from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter

    ana = RegexTokenizer() | DoubleMetaphoneFilter()
    field = fields.TEXT(analyzer=ana, phrase=False)
    results = sorted(field.index(u("spruce view")))
    assert results == [
        (b('F'), 1, 1.0, b('\x00\x00\x00\x01')),
        (b('FF'), 1, 0.5, b('\x00\x00\x00\x01')),
        (b('SPRS'), 1, 1.0, b('\x00\x00\x00\x01')),
    ]
def test_stored_fields():
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()
        seg.set_doc_count(4)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_count_all() == 4
        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}

        # Note: access out of order
        assert pdr.stored_fields(3) == {"a": "alfa", "b": "bravo"}
        assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"}

        sfs = list(pdr.all_stored_fields())
        assert len(sfs) == 4
        assert sfs == [
            {"a": "hello", "b": "there"},
            {"a": "one", "b": "two", "c": "three"},
            {},
            {"a": "alfa", "b": "bravo"},
        ]
        pdr.close()
def test_term_inspection_segment_reader():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u"My document",
                content=u"AA AA BB BB CC AA AA AA BB BB CC DD EE EE")
            w.add_document(title=u"My other document",
                           content=u"AA AÉ BB CC EE EE Aú AÚ DD")

        _check_inspection_results(ix)
def test_stored_fields():
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()

        dr = codec.stored_fields_reader(st, seg)
        assert_equal(dr[0], {"a": "hello", "b": "there"})
        # Note: access out of order
        assert_equal(dr[3], {"a": "alfa", "b": "bravo"})
        assert_equal(dr[1], {"a": "one", "b": "two", "c": "three"})
        dr.close()

        dr = codec.stored_fields_reader(st, seg)
        sfs = list(dr)
        assert_equal(sfs, [
            {"a": "hello", "b": "there"},
            {"a": "one", "b": "two", "c": "three"},
            {},
            {"a": "alfa", "b": "bravo"},
        ])
        dr.close()
def test_creation():
    s = fields.Schema(content=fields.TEXT(phrase=True),
                      title=fields.TEXT(stored=True),
                      path=fields.ID(stored=True),
                      tags=fields.KEYWORD(stored=True),
                      quick=fields.NGRAM,
                      note=fields.STORED)
    st = RamStorage()

    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(title=u("First"),
                   content=u("This is the first document"),
                   path=u("/a"),
                   tags=u("first second third"),
                   quick=u("First document"),
                   note=u("This is the first document"))
    w.add_document(content=u("Let's try this again"),
                   title=u("Second"),
                   path=u("/b"),
                   tags=u("Uno Dos Tres"),
                   quick=u("Second document"),
                   note=u("This is the second document"))
    w.commit()
class TweetSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    url = fields.ID(stored=True, unique=True)
    text = fields.TEXT(stored=True)
    source = fields.TEXT(stored=True)
    reply = fields.BOOLEAN(stored=True)
    in_reply_to_id = fields.TEXT(stored=True)
    in_reply_to_name = fields.TEXT(stored=True)
    user_mentions = fields.KEYWORD(stored=True)
    hashtags = fields.KEYWORD(stored=True)
    urls = fields.KEYWORD(stored=True)
    geo = fields.BOOLEAN(stored=True)
    latitude = fields.NUMERIC(stored=True)
    longitude = fields.NUMERIC(stored=True)
    date = fields.DATETIME(stored=True)
def test_creation1(self):
    s = fields.Schema()
    s.add("content", fields.TEXT(phrase=True))
    s.add("title", fields.TEXT(stored=True))
    s.add("path", fields.ID(stored=True))
    s.add("tags", fields.KEYWORD(stored=True))
    s.add("quick", fields.NGRAM)
    s.add("note", fields.STORED)

    self.assertEqual(s.field_names(),
                     ["content", "title", "path", "tags", "quick", "note"])
    self.assert_("content" in s)
    self.assertFalse("buzz" in s)
    self.assert_(isinstance(s["tags"], fields.KEYWORD))
    self.assert_(isinstance(s[3], fields.KEYWORD))
    self.assert_(s[0] is s.field_by_number(0))
    self.assert_(s["title"] is s.field_by_name("title"))
    self.assert_(s.name_to_number("path") == 2)
    self.assert_(s.number_to_name(4) == "quick")
    self.assertEqual(s.scorable_fields(), [0, 1, 4])
def test_fuzzy_prefix():
    from whoosh import scoring

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(spelling=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        # Match -> first
        w.add_document(title=u("First"),
                       content=u("This is the first document we've added!"))
        # No match
        w.add_document(
            title=u("Second"),
            content=u("The second one is even more interesting! filst"))
        # Match -> first
        w.add_document(title=u("Third"),
                       content=u("The world first line we've added!"))
        # Match -> zeroth
        w.add_document(
            title=u("Fourth"),
            content=u("The second one is alaways comes after zeroth!"))
        # Match -> fire is within 2 edits (transpose + delete) of first
        w.add_document(title=u("Fifth"),
                       content=u("The fire is beautiful"))

    from whoosh.qparser import QueryParser, FuzzyTermPlugin
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    q = parser.parse("first~2/3 OR zeroth", debug=False)

    assert isinstance(q, query.Or)
    ft = q[0]
    assert isinstance(ft, query.FuzzyTerm)
    assert ft.maxdist == 2
    assert ft.prefixlength == 3

    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(q)
        assert len(results) == 4
        assert (" ".join(sorted(hit["title"] for hit in results))
                == "Fifth First Fourth Third")
def test_multi_language():
    # Analyzer for English
    ana_eng = analysis.StemmingAnalyzer()

    # Analyzer for Pig Latin
    def stem_piglatin(w):
        if w.endswith("ay"):
            w = w[:-2]
        return w
    ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"],
                                        stemfn=stem_piglatin)

    # Dictionary mapping languages to analyzers
    analyzers = {"eng": ana_eng, "pig": ana_pig}

    # Fake documents
    corpus = [(u("eng"), u("Such stuff as dreams are made on")),
              (u("pig"), u("Otay ebay, roay otnay otay ebay"))]

    schema = fields.Schema(content=fields.TEXT(stored=True),
                           lang=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        for doclang, content in corpus:
            ana = analyzers[doclang]
            # "Pre-analyze" the field into token strings
            words = [token.text for token in ana(content)]
            # Note we store the original value but index the pre-analyzed words
            w.add_document(lang=doclang, content=words,
                           _stored_content=content)

    with ix.searcher() as s:
        schema = s.schema

        # Modify the schema to fake the correct analyzer for the language
        # we're searching in
        schema["content"].analyzer = analyzers["eng"]

        qp = qparser.QueryParser("content", schema)
        q = qp.parse("dreaming")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Such stuff as dreams are made on"

        schema["content"].analyzer = analyzers["pig"]

        qp = qparser.QueryParser("content", schema)
        q = qp.parse("otnay")
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["content"] == "Otay ebay, roay otnay otay ebay"
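# A minimal sketch of how the per-language trick in test_multi_language()
# might be packaged as a reusable helper. The function name and parameters
# are hypothetical, not part of the original test; it only reuses the
# "swap the analyzer before parsing" approach demonstrated above.
def search_in_language(searcher, analyzers, lang, querystring):
    # Point the content field at the analyzer for the requested language so
    # query terms are stemmed the same way the documents were pre-analyzed.
    schema = searcher.schema
    schema["content"].analyzer = analyzers[lang]
    qp = qparser.QueryParser("content", schema)
    return searcher.search(qp.parse(querystring))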
def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u("My document"),
                content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
            w.add_document(title=u("My other document"),
                           content=u("AA AB BB CC EE EE AX AX DD"))

        with ix.reader() as r:
            cterms = " ".join(r.field_terms("content"))
            assert cterms == "aa ab ax bb cc dd ee"

            a_exp = list(r.expand_prefix("content", "a"))
            assert a_exp == [b('aa'), b('ab'), b('ax')]

            assert set(r.all_terms()) == set([('content', b('aa')),
                                              ('content', b('ab')),
                                              ('content', b('ax')),
                                              ('content', b('bb')),
                                              ('content', b('cc')),
                                              ('content', b('dd')),
                                              ('content', b('ee')),
                                              ('title', b('document')),
                                              ('title', b('my')),
                                              ('title', b('other'))])

            # (text, doc_freq, index_freq)
            cstats = _fstats(r.iter_field("content"))
            assert cstats == [(b('aa'), 2, 6), (b('ab'), 1, 1),
                              (b('ax'), 1, 2), (b('bb'), 2, 5),
                              (b('cc'), 2, 3), (b('dd'), 2, 2),
                              (b('ee'), 2, 4)]

            prestats = _fstats(r.iter_field("content", prefix="c"))
            assert prestats == [(b('cc'), 2, 3), (b('dd'), 2, 2),
                                (b('ee'), 2, 4)]

            assert list(r.most_frequent_terms("content")) == [
                (6, b('aa')), (5, b('bb')), (4, b('ee')),
                (3, b('cc')), (2, b('dd'))]
            assert list(r.most_frequent_terms("content", prefix="a")) == [
                (6, b('aa')), (2, b('ax')), (1, b('ab'))]
            assert list(r.most_distinctive_terms("content", 3)) == [
                (1.3862943611198906, b('ax')),
                (0.6931471805599453, b('ab')),
                (0.0, b('ee'))]
def test_multitoken_phrase():
    textfield = fields.TEXT()
    textfield.multitoken_query = "phrase"
    schema = fields.Schema(text=textfield)
    parser = default.QueryParser("text", schema)
    qstring = u("chaw-bacon")

    texts = list(schema["text"].process_text(qstring))
    assert texts == ["chaw", "bacon"]

    q = parser.parse(qstring)
    assert q.__class__ == query.Phrase
def test_bypass_stemming():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(text=u("rendering shading modeling reactions"))
    w.commit()

    with ix.reader() as r:
        assert_equal(list(r.lexicon("text")),
                     ["model", "reaction", "render", "shade"])
        assert_equal(list(r.word_graph("text").flatten_strings()),
                     ["modeling", "reactions", "rendering", "shading"])
def build_index(self):
    schema = fields.Schema(keyword=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True))

    # Recreate the index directory from scratch
    if not os.path.exists(config.path_index):
        os.mkdir(config.path_index)
    else:
        shutil.rmtree(config.path_index)
        os.mkdir(config.path_index)

    index.create_in(config.path_index, schema)
    ix = index.open_dir(config.path_index)
    writer = ix.writer()
    # Index every segmented word of every line in the source file
    with open(config.path_dirty_talk, 'r') as fr:
        for line in fr:
            for word in jieba_tool.cut(line):
                print(word)
                writer.add_document(keyword=word, content=line.strip())
    writer.commit()
def test_docwriter_one():
    field = fields.TEXT(stored=True)
    st, codec, seg = _make_codec()
    dw = codec.per_document_writer(st, seg)
    dw.start_doc(0)
    dw.add_field("text", field, "Testing one two three", 4)
    dw.finish_doc()
    dw.close()
    seg.set_doc_count(1)

    pdr = codec.per_document_reader(st, seg)
    assert pdr.doc_field_length(0, "text") == 4
    assert pdr.stored_fields(0) == {"text": "Testing one two three"}
def test_groupby_phrase():
    domain = {"Alan Ball": "Tel Aviv", "Alan Charles": "San Francisco",
              "Alan Darwin": "London", "Alan Eames": "Paris"}

    schema = fields.Schema(name=fields.TEXT(stored=True),
                           city=fields.TEXT(stored=True),
                           city_g=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for name, city in domain.items():
            w.add_document(name=u(name), city=u(city), city_g=u(city))

    with ix.searcher() as s:
        q = query.Term("name", "alan")

        r = s.search(q, groupedby="city_g")
        keys = sorted(r.groups().keys())
        assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"]

        sff = sorting.StoredFieldFacet("city")
        r = s.search(q, groupedby=sff)
        keys = sorted(r.groups().keys())
        assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"]
def _mk_schema(self, dsinfo):
    from whoosh import fields as wf
    from whoosh.analysis import StandardAnalyzer

    # TODO support some customizable mapping to homogenize some metadata
    # fields onto a given set of index keys
    self.schema = wf.Schema(
        id=wf.ID,
        path=wf.ID(stored=True),
        type=wf.ID(stored=True),
        parentds=wf.ID(stored=True),
        meta=wf.TEXT(stored=False,
                     analyzer=StandardAnalyzer(minsize=2)))
def test_score_retrieval():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True))
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("Miss Mary"),
                        content=u("Mary had a little white lamb its fleece"
                                  " was white as snow"))
    writer.add_document(title=u("Snow White"),
                        content=u("Snow white lived in the forest with seven"
                                  " dwarfs"))
    writer.commit()

    with ix.searcher() as s:
        results = s.search(query.Term("content", "white"))
        assert len(results) == 2
        assert results[0]['title'] == u("Miss Mary")
        assert results[1]['title'] == u("Snow White")
        assert results.score(0) is not None
        assert results.score(0) != 0
        assert results.score(0) != 1
def test_missing_field_scoring(self):
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = store.RamStorage()
    idx = index.Index(storage, schema, create=True)
    writer = idx.writer()
    writer.add_document(name=u'Frank', hobbies=u'baseball, basketball')
    writer.commit()
    self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 1)  # name

    writer = idx.writer()
    writer.add_document(name=u'Jonny')
    writer.commit()
    self.assertEqual(len(idx.segments), 1)
    self.assertEqual(idx.segments[0].field_length(0), 2)  # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 2)  # name

    parser = qparser.MultifieldParser(['name', 'hobbies'], schema=schema)
    searcher = idx.searcher()
    result = searcher.search(parser.parse(u'baseball'))
    self.assertEqual(len(result), 1)
def test_spellable_list():
    # Make sure a spellable field works with a list of pre-analyzed tokens
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(Location=fields.STORED,
                           Lang=fields.STORED,
                           Title=fields.TEXT(spelling=True, analyzer=ana))
    ix = RamStorage().create_index(schema)

    doc = {'Location': '1000/123', 'Lang': 'E',
           'Title': ['Introduction', 'Numerical', 'Analysis']}

    with ix.writer() as w:
        w.add_document(**doc)
def _create_data_index():
    schema = whoosh_fields.Schema(
        url=whoosh_fields.ID(stored=True, unique=True),
        type=whoosh_fields.STORED(),
        title=whoosh_fields.STORED(),
        description=whoosh_fields.STORED(),
        org=whoosh_fields.STORED(),
        subtype=whoosh_fields.STORED(),
        content=whoosh_fields.TEXT(),
    )
    _ensure_dir(DATA_DIR)
    assert not whoosh_index.exists_in(DATA_DIR), DATA_DIR
    return whoosh_index.create_in(DATA_DIR, schema)
def test_nonexistant_fieldnames():
    # Need an analyzer that won't mangle a URL
    a = analysis.SimpleAnalyzer("\\S+")
    schema = fields.Schema(id=fields.ID, text=fields.TEXT(analyzer=a))

    qp = default.QueryParser("text", schema)
    q = qp.parse(u("id:/code http://localhost/"))
    assert q.__class__ == query.And
    assert q[0].__class__ == query.Term
    assert q[0].fieldname == "id"
    assert q[0].text == "/code"
    assert q[1].__class__ == query.Term
    assert q[1].fieldname == "text"
    assert q[1].text == "http://localhost/"