def test_reverse_collapse():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT,
                           path=fields.ID(stored=True),
                           tags=fields.KEYWORD,
                           order=fields.NUMERIC(stored=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(title=u"First document", content=u"This is my document!",
                       path=u"/a", tags=u"first", order=20.0)
        w.add_document(title=u"Second document",
                       content=u"This is the second example.",
                       path=u"/b", tags=u"second", order=12.0)
        w.add_document(title=u"Third document", content=u"Examples are many.",
                       path=u"/c", tags=u"third", order=15.0)
        w.add_document(title=u"Thirdish document",
                       content=u"Examples are too many.",
                       path=u"/d", tags=u"third", order=25.0)

    with ix.searcher() as s:
        q = query.Every('content')
        r = s.search(q)
        assert [hit["path"] for hit in r] == ["/a", "/b", "/c", "/d"]

        q = query.Or([query.Term("title", "document"),
                      query.Term("content", "document"),
                      query.Term("tags", "document")])
        cf = sorting.FieldFacet("tags")
        of = sorting.FieldFacet("order", reverse=True)
        r = s.search(q, collapse=cf, collapse_order=of, terms=True)
        assert [hit["path"] for hit in r] == ["/a", "/b", "/d"]
def test_contains():
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("alfa sierra tango"))
    w.add_document(text=u("bravo charlie delta"))
    w.add_document(text=u("charlie delta echo"))
    w.add_document(text=u("delta echo foxtrot"))
    w.commit()

    q = query.Or([query.Term("text", "bravo"), query.Term("text", "charlie")])
    r = ix.searcher().search(q, terms=True)
    for hit in r:
        assert not hit.contains_term("text", "alfa")
        assert (hit.contains_term("text", "bravo")
                or hit.contains_term("text", "charlie"))
        assert not hit.contains_term("text", "foxtrot")
def parse_query(self, fieldname, qstring, boost=1.0):
    from whoosh import query

    terms = [query.Term(fieldname, g)
             for g in self.process_text(qstring, mode='query')]
    cls = query.Or if self.queryor else query.And
    return cls(terms, boost=boost)
def lookup(self, source_language, target_language, text, user, project,
           use_shared):
    langfilter = query.And([
        query.Term('source_language', source_language),
        query.Term('target_language', target_language),
        self.get_filter(user, project, use_shared, True),
    ])
    text_query = self.parser.parse(text)
    matches = self.searcher.search(text_query, filter=langfilter, limit=20000)
    for match in matches:
        similarity = self.comparer.similarity(text, match['source'])
        if similarity < 30:
            continue
        yield (match['source'], match['target'], similarity,
               match['category'], match['origin'])
def get_attachments_from_dms(community):
    svc = current_app.services['indexing']
    filters = wq.And([
        wq.Term('community_id', community.id),
        wq.Term('object_type', Document.entity_type)
    ])
    sortedby = whoosh.sorting.FieldFacet('created_at', reverse=True)
    documents = svc.search(u'', filter=filters, sortedby=sortedby, limit=50)

    attachments = []
    for doc in documents:
        url = url_for(doc)
        attachment = Attachment(url, doc['name'], doc['owner_name'],
                                doc['created_at'], doc.get('content_length'),
                                doc.get('content_type', u''))
        attachments.append(attachment)
    return attachments
def lookup(self, source_language, target_language, text):
    langfilter = query.And([
        query.Term('source_language', source_language),
        query.Term('target_language', target_language),
    ])
    self.open_searcher()
    text_query = self.parser.parse(text)
    matches = self.searcher.search(
        text_query, filter=langfilter, limit=20000
    )
    for match in matches:
        similarity = self.comparer.similarity(text, match['source'])
        if similarity < 30:
            continue
        yield (
            match['source'], match['target'], similarity, match['origin']
        )
def test_can_apply_filter_and_facet(self):
    self.whoosh_backend.add_doc(dict(id="1", type="ticket"))
    self.whoosh_backend.add_doc(dict(id="2", type="wiki"))
    result = self.whoosh_backend.query(query.Every(),
                                       filter=query.Term("type", "ticket"),
                                       facets=["type"])
    self.print_result(result)
    self.assertEqual(1, result.hits)
    self.assertEqual("ticket", result.docs[0]["type"])
def test_unstored():
    schema = fields.Schema(text=fields.TEXT, tags=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("alfa bravo charlie"), tags=u("delta echo"))
    w.commit()

    hit = ix.searcher().search(query.Term("text", "bravo"))[0]
    assert_raises(KeyError, hit.highlights, "tags")
def test_memory_codec():
    from whoosh.codec import memory
    from whoosh.searching import Searcher

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(vector=True),
                           b=fields.STORED,
                           c=fields.NUMERIC(stored=True, sortable=True),
                           d=fields.TEXT(analyzer=ana, spelling=True))

    codec = memory.MemoryCodec()
    with codec.writer(schema) as w:
        w.add_document(a=u("alfa bravo charlie"), b="hello", c=100,
                       d=u("quelling whining echoing"))
        w.add_document(a=u("bravo charlie delta"), b=1000, c=200,
                       d=u("rolling timing yelling"))
        w.add_document(a=u("charlie delta echo"), b=5.5, c=300,
                       d=u("using opening pulling"))
        w.add_document(a=u("delta echo foxtrot"), b=True, c=-100,
                       d=u("aching selling dipping"))
        w.add_document(a=u("echo foxtrot india"), b=None, c=-200,
                       d=u("filling going hopping"))

    reader = codec.reader(schema)
    s = Searcher(reader)

    assert ("a", "delta") in reader
    q = query.Term("a", "delta")
    r = s.search(q)
    assert len(r) == 3
    assert [hit["b"] for hit in r] == [1000, 5.5, True]

    assert (" ".join(s.field_terms("a"))
            == "alfa bravo charlie delta echo foxtrot india")

    cfield = schema["c"]
    c_sortables = cfield.sortable_terms(reader, "c")
    c_values = [cfield.from_bytes(t) for t in c_sortables]
    assert c_values == [-200, -100, 100, 200, 300]

    assert reader.has_column("c")
    c_values = list(reader.column_reader("c"))
    assert c_values == [100, 200, 300, -100, -200]

    assert s.has_vector(2, "a")
    v = s.vector(2, "a")
    assert " ".join(v.all_ids()) == "charlie delta echo"
def test_sequence():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, text=u("alfa bravo charlie delta echo"))
        w.add_document(id=1, text=u("bravo charlie delta echo alfa"))
        w.add_document(id=2, text=u("charlie delta echo bravo"))
        w.add_document(id=3, text=u("delta echo charlie"))
        w.add_document(id=4, text=u("echo delta"))

    with ix.searcher() as s:
        seq = query.Sequence([query.Term("text", u("echo")),
                              query.Term("text", u("alfa"))])
        q = query.And([query.Term("text", "bravo"), seq])
        r = s.search(q, limit=4)
        assert len(r) == 1
        assert r[0]["id"] == 1
def test_nested_skip():
    schema = fields.Schema(
        id=fields.ID(unique=True, stored=True),
        name=fields.TEXT(stored=True),
        name_ngrams=fields.NGRAMWORDS(minsize=4, field_boost=1.2),
        type=fields.TEXT,
    )

    domain = [(u"book_1", u"The Dark Knight Returns", u"book"),
              (u"chapter_1", u"The Dark Knight Returns", u"chapter"),
              (u"chapter_2", u"The Dark Knight Triumphant", u"chapter"),
              (u"chapter_3", u"Hunt the Dark Knight", u"chapter"),
              (u"chapter_4", u"The Dark Knight Falls", u"chapter")]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for id, name, typ in domain:
                w.add_document(id=id, name=name, name_ngrams=name, type=typ)

        with ix.searcher() as s:
            all_parents = query.Term("type", "book")
            wanted_parents = query.Term("name", "dark")
            children_of_wanted_parents = query.NestedChildren(all_parents,
                                                              wanted_parents)

            r1 = s.search(children_of_wanted_parents)
            assert r1.scored_length() == 4
            assert [hit["id"] for hit in r1] == ["chapter_1", "chapter_2",
                                                 "chapter_3", "chapter_4"]

            wanted_children = query.And([query.Term("type", "chapter"),
                                         query.Term("name", "hunt")])

            r2 = s.search(wanted_children)
            assert r2.scored_length() == 1
            assert [hit["id"] for hit in r2] == ["chapter_3"]

            complex_query = query.And([children_of_wanted_parents,
                                       wanted_children])

            r3 = s.search(complex_query)
            assert r3.scored_length() == 1
            assert [hit["id"] for hit in r3] == ["chapter_3"]
def test_missing():
    schema = fields.Schema(kind=fields.ID,
                           name=fields.KEYWORD(scorable=True, stored=True))
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u("class"), name=u("Index"))
            w.add_document(kind=u("method"), name=u("add document"))
            w.add_document(kind=u("method"), name=u("add reader"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Accumulator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("get result"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Calculator"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("add all"))
            w.add_document(kind=u("method"), name=u("add some"))
            w.add_document(kind=u("method"), name=u("multiply"))
            w.add_document(kind=u("method"), name=u("close"))
        with w.group():
            w.add_document(kind=u("class"), name=u("Deleter"))
            w.add_document(kind=u("method"), name=u("add"))
            w.add_document(kind=u("method"), name=u("delete"))

    with ix.searcher() as s:
        q = query.NestedParent(query.Term("kind", "class"),
                               query.Term("name", "add"))
        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Calculator", "Index",
                                              "Accumulator", "Deleter"]

    with ix.writer() as w:
        w.delete_by_term("name", "Accumulator")
        w.delete_by_term("name", "Calculator")

    with ix.searcher() as s:
        pq = query.Term("kind", "class")
        assert len(list(pq.docs(s))) == 2
        q = query.NestedParent(pq, query.Term("name", "add"))
        r = s.search(q)
        assert [hit["name"] for hit in r] == ["Index", "Deleter"]
def document_numbers(self, **kw):
    """Returns a generator of the document numbers for documents matching
    the given keyword arguments, where the keyword keys are field names
    and the values are terms that must appear in the field.

    >>> docnums = list(searcher.document_numbers(emailto=u"*****@*****.**"))
    """

    q = query.And([query.Term(k, v) for k, v in kw.iteritems()])
    return q.docs(self)
def get_filter(user, project, use_shared, use_file):
    """Create query to filter categories based on selection."""
    # Always include file imported memory
    if use_file:
        category_filter = [query.Term('category', CATEGORY_FILE)]
    else:
        category_filter = []
    # Per user memory
    if user:
        category_filter.append(
            query.Term('category', CATEGORY_USER_OFFSET + user.id)
        )
    # Private project memory
    if project:
        category_filter.append(
            query.Term('category', CATEGORY_PRIVATE_OFFSET + project.id)
        )
    # Shared memory
    if use_shared:
        category_filter.append(query.Term('category', CATEGORY_SHARED))
    return query.Or(category_filter)
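# Illustrative sketch (not from the source): the Or filter built by
# get_filter() can be combined with language terms into a single And filter,
# in the style of the lookup() methods above. The searcher, parser, language
# codes and limit used here are assumptions for the example only.
def example_filtered_lookup(searcher, parser, user, project):
    langfilter = query.And([
        query.Term('source_language', 'en'),
        query.Term('target_language', 'cs'),
        get_filter(user, project, use_shared=True, use_file=True),
    ])
    # Only documents matching both the text query and the filter are returned.
    return searcher.search(parser.parse(u"hello"), filter=langfilter, limit=20)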
def test_current_terms():
    domain = u("alfa bravo charlie delta").split()
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for ls in permutations(domain, 3):
        w.add_document(text=" ".join(ls), _stored_text=ls)
    w.commit()

    with ix.searcher() as s:
        q = query.And([query.Term("text", "alfa"),
                       query.Term("text", "charlie")])
        m = q.matcher(s)
        while m.is_active():
            assert sorted(m.matching_terms()) == [("text", b("alfa")),
                                                  ("text", b("charlie"))]
            m.next()
def test_lengths2():
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    for _ in xrange(3):
        w = ix.writer()
        for ls in permutations(u("alfa bravo charlie").split()):
            if "bravo" in ls and "charlie" in ls:
                count += 1
            w.add_document(text=u(" ").join(ls))
        w.commit(merge=False)

    with ix.searcher() as s:
        q = query.Or([query.Term("text", u("bravo")),
                      query.Term("text", u("charlie"))])
        r = s.search(q, limit=None)
        assert len(r) == count

        r = s.search(q, limit=3)
        assert len(r) == count
def test_or_nots1():
    # Issue #285
    schema = fields.Schema(a=fields.KEYWORD(stored=True),
                           b=fields.KEYWORD(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(a=u("alfa"), b=u("charlie"))

    with ix.searcher() as s:
        q = query.And([
            query.Term("a", "alfa"),
            query.Or([
                query.Not(query.Term("b", "bravo")),
                query.Not(query.Term("b", "charlie"))
            ])
        ])
        r = s.search(q)
        assert len(r) == 1
def test_indexing():
    ix = make_index()
    with ix.searcher() as s:
        q = query.Term("text", "format")
        r = s.search(q)
        assert_equal(len(r), 2)
        assert_equal(r[0]["id"], "format")
        assert_equal(r[0]["subs"], 100)
        assert_equal(r[1]["id"], "vector")
        assert_equal(r[1]["subs"], 23)
def highlighted(self):
    engine = SearchEngine(self.indexpath)
    engine.open_index()
    searcher, _queryparser = engine.find()
    results = searcher.search(query.Term('content', self.q), limit=10)
    url = []
    content = []
    for hit in results:
        url.append(hit["url"])
        content.append(hit.highlights("content"))
    return (url, content)
def search(self, keyword, notebook_id=None):
    with self.index.searcher() as searcher:
        query_parser = MultifieldParser(
            ["title", "snippet"], schema=self.index.schema).parse(keyword)
        notebook_filter = (query.Term("notebook_id", notebook_id)
                           if notebook_id else None)
        results = searcher.search(query_parser, filter=notebook_filter,
                                  limit=None)
        return [res['note_id'] for res in results]
def eval_get_ranked_set_baseline(self, basefile):
    # Step 1: Read the saved keyterms for a subset of articles
    # (created by analyze_baseline_queries)
    g = Graph()
    g.parse(self.generic_path("keyterms", "analyzed", ".n3"), format="n3")

    articles = {}
    for (s, p, o) in g:
        if not str(s) in articles:
            articles[str(s)] = []
        articles[str(s)].append(str(o))

    # Step 2: Open the large whoosh index containing the text of
    # all cases. Then, create a query for each article based on
    # the keyterms.
    connector = query.Or
    indexdir = os.path.sep.join([self.config.datadir, 'ecj', 'index'])
    storage = FileStorage(indexdir)
    idx = storage.open_index()
    searcher = idx.searcher(weighting=scoring.BM25F())

    res = {}
    # for article in sorted(articles.keys()):
    for article in self._articles(basefile):
        terms = articles[article]
        rankedset = []
        # parser = qparser.QueryParser("content", idx.schema)
        # q = parser.parse(connector.join(terms))
        q = query.And([
            # query.Term("articles", article),
            connector([query.Term("content", x) for x in terms])
        ])
        # print q
        # self.log.debug("Article %s: %s", article, " or ".join(terms))
        results = searcher.search(q, limit=None)
        resultidx = 0
        # self.log.info("Keyterms for result: %r" %
        #               results.key_terms("content", docs=10, numterms=10))
        for result in results:
            reslbl = "%s (%s)" % (result['basefile'],
                                  results.score(resultidx))
            rankedset.append([result['basefile'], reslbl])
            # self.log.debug(u"\t%s: %2.2d" % (result['title'],
            #                                  results.score(resultidx)))
            resultidx += 1
        self.log.info(
            "Created baseline ranked set for %s: Top result %s (of %s)"
            % (article.split("/")[-1], rankedset[0][0], len(rankedset)))
        # return just a list of URIs, no scoring information. But the
        # full URI isn't available in the whoosh db, so we recreate it.
        res[article] = ["http://lagen.nu/ext/celex/%s" % x[0]
                        for x in rankedset]
    return res
def test_sorted_extend():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           keywords=fields.TEXT,
                           num=fields.NUMERIC(stored=True, sortable=True))
    domain = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    keys = u"juliet kilo lima november oskar papa quebec romeo".split()

    combined = 0
    tcount = 0
    kcount = 0
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for i, words in enumerate(permutations(domain, 3)):
                key = keys[i % (len(domain) - 1)]
                if "bravo" in words:
                    tcount += 1
                if key == "kilo":
                    kcount += 1
                if "bravo" in words or key == "kilo":
                    combined += 1
                w.add_document(title=u" ".join(words), keywords=key, num=i)

        with ix.searcher() as s:
            facet = sorting.MultiFacet([
                sorting.FieldFacet("num", reverse=True),
                sorting.ScoreFacet()
            ])
            r1 = s.search(query.Term("title", "bravo"), limit=None,
                          sortedby=facet)
            r2 = s.search(query.Term("keywords", "kilo"), limit=None,
                          sortedby=facet)
            assert len(r1) == tcount
            assert len(r2) == kcount

            r1.extend(r2)
            assert len(r1) == combined
def test_multireader():
    sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(sc)
    w = ix.writer()
    w.add_document(id=u("alfa"), content=u("alfa"))
    w.add_document(id=u("bravo"), content=u("bravo"))
    w.add_document(id=u("charlie"), content=u("charlie"))
    w.add_document(id=u("delta"), content=u("delta"))
    w.add_document(id=u("echo"), content=u("echo"))
    w.add_document(id=u("foxtrot"), content=u("foxtrot"))
    w.add_document(id=u("golf"), content=u("golf"))
    w.add_document(id=u("hotel"), content=u("hotel"))
    w.add_document(id=u("india"), content=u("india"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")

    w = ix.writer()
    w.add_document(id=u("juliet"), content=u("juliet"))
    w.add_document(id=u("kilo"), content=u("kilo"))
    w.add_document(id=u("lima"), content=u("lima"))
    w.add_document(id=u("mike"), content=u("mike"))
    w.add_document(id=u("november"), content=u("november"))
    w.add_document(id=u("oscar"), content=u("oscar"))
    w.add_document(id=u("papa"), content=u("papa"))
    w.add_document(id=u("quebec"), content=u("quebec"))
    w.add_document(id=u("romeo"), content=u("romeo"))
    w.commit()
    assert_equal(len(ix._segments()), 2)

    #r = ix.reader()
    #assert r.__class__.__name__, "MultiReader")
    #pr = r.postings("content", u("bravo"))

    with ix.searcher() as s:
        r = s.search(query.Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")
def related(self, kitab, vrr, nodeIdNum):
    dn, kt = self.keyterms(kitab, vrr, nodeIdNum)
    if not dn:
        return None
    for t, r in kt:
        print "term=", t, " @ rank=", r
    q = query.Or([query.Term("content", t) for (t, r) in kt])
    results = self.indexer.searcher().search(q, limit=10)
    for i, fields in enumerate(results):
        if results.docnum(i) != dn:
            print fields['kitab'], "\t\t", str(fields['nodeIdNum']), \
                "\t\t", fields['title']
def parse_query(self, fieldname, qstring, boost=1.0):
    from whoosh import query

    text = None
    if qstring in self.falses:
        text = self.strings[0]
    elif qstring in self.trues:
        text = self.strings[1]

    if text is None:
        return query.NullQuery
    return query.Term(fieldname, text, boost=boost)
def query_a(r, list_a, e1_type, e2_type, index):
    idx = open_dir(index)
    entity1 = "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">"
    entity2 = "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">"
    t1 = query.Term("sentence", entity1)
    t2 = query.Term("sentence", r.patterns)
    t3 = query.Term("sentence", entity2)
    q1 = spans.SpanNear2([t1, t2, t3], slop=5, ordered=True)
    q2 = spans.SpanNear2([t1, t3], slop=5, ordered=True)
    with idx.searcher() as searcher:
        entities_r = searcher.search(q1)
        entities = searcher.search(q2)
        # TODO: apply stemming or normalization to the word used in the query
        if len(entities) > 0:
            pmi = float(len(entities_r)) / float(len(entities))
            # TODO: what is the best threshold value?
            if pmi >= 0.5:
                #print entity1, '\t', r.patterns, '\t', entity2, pmi
                list_a.append(r)
def parse_query(self, fieldname, qstring, boost=1.0):
    from whoosh import query
    from whoosh.support.times import is_ambiguous

    at = self._parse_datestring(qstring)
    if is_ambiguous(at):
        startnum = datetime_to_long(at.floor())
        endnum = datetime_to_long(at.ceil())
        return query.NumericRange(fieldname, startnum, endnum)
    else:
        return query.Term(fieldname, self.to_text(at), boost=boost)
def search(self, type, pattern, start=0, rows=10, default_field='text'):
    assert type in ('idea', 'user'), "Type %r not supported" % type
    with self._index.searcher() as searcher:
        q = QueryParser(default_field,
                        schema=self._index.schema).parse(pattern)
        results = searcher.search(q, limit=max(start + rows, 1),
                                  filter=query.Term('type', type))
        return ([item['id'] for item in results[start:start + rows]],
                len(results))
def run(self):
    print(self.name + " starting")
    for _ in xrange(10):
        ix = st.open_index()
        s = ix.searcher()
        q = query.Term("content", random.choice(domain))
        s.search(q, limit=10)
        s.close()
        ix.close()
        time.sleep(0.1)
    print(self.name + " done")
def test_filter_results_count():
    schema = fields.Schema(id=fields.STORED,
                           django_ct=fields.ID(stored=True),
                           text=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(id=1, django_ct=u("app.model1"),
                           text=u("alfa bravo charlie"))
            w.add_document(id=2, django_ct=u("app.model1"),
                           text=u("alfa bravo delta"))
            w.add_document(id=3, django_ct=u("app.model2"),
                           text=u("alfa charlie echo"))

        with ix.searcher() as s:
            q = query.Term("django_ct", u("app.model1"))
            r1 = s.search(q, limit=None)
            assert len(r1) == 2

            q = query.Term("text", u("alfa"))
            r2 = s.search(q, filter=r1, limit=1)
            assert len(r2) == 2