def test_filter():
    schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    # merge=False keeps each commit in its own segment, so the filtered
    # searches below run across three segments.
    w = ix.writer()
    w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie"))
    w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta"))
    w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa"))
    w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo"))
    w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta"))
    w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo"))
    w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa"))
    w.commit(merge=False)

    with ix.searcher() as s:
        # Only allow documents whose path starts with /a or /b
        fq = query.Or([query.Prefix("path", "/a"), query.Prefix("path", "/b")])

        r = s.search(query.Term("text", "alfa"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 4, 5])

        r = s.search(query.Term("text", "bravo"), filter=fq)
        assert_equal([d["id"] for d in r], [1, 2, 5, 7])

def test_searching():
    with make_index().searcher() as s:
        def _runq(q, result, **kwargs):
            r = s.search(q, **kwargs)
            assert_equal([d["id"] for d in r], result)

        _runq(query.Term("text", u("format")), ["format", "vector"])
        _runq(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        _runq(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        _runq(query.Wildcard("id", u("*st*")), ["stored", "const"])
        _runq(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        _runq(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        _runq(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"], limit=None)
        _runq(query.Every(),
              ["fieldtype", "format", "vector", "scorable", "stored",
               "unique", "const"])
        _runq(query.Every("subs"),
              ["fieldtype", "format", "vector", "scorable", "stored",
               "unique", "const"])

def test_termdocs():
    schema = fields.Schema(key=fields.TEXT, city=fields.ID)
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        w.add_document(key=u"ant", city=u"london")
        w.add_document(key=u"anteater", city=u"roma")
        w.add_document(key=u"bear", city=u"london")
        w.add_document(key=u"bees", city=u"roma")
        w.add_document(key=u"anorak", city=u"london")
        w.add_document(key=u"antimatter", city=u"roma")
        w.add_document(key=u"angora", city=u"london")
        w.add_document(key=u"angels", city=u"roma")

    with ix.searcher() as s:
        cond_q = query.Term("city", u"london")
        pref_q = query.Prefix("key", u"an")
        q = query.And([cond_q, pref_q]).normalize()
        r = s.search(q, scored=False, terms=True)

        field = s.schema["key"]
        terms = [field.from_bytes(term)
                 for fieldname, term in r.termdocs
                 if fieldname == "key"]
        assert sorted(terms) == [u"angora", u"anorak", u"ant"]

def test_multi():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.KEYWORD(stored=True))
    with TempIndex(schema, "multi") as ix:
        writer = ix.writer()
        # Deleted in the first delete batch below
        writer.add_document(id=u("1"), content=u("alfa bravo charlie"))
        writer.add_document(id=u("2"), content=u("bravo charlie delta echo"))
        # Deleted in the second delete batch below
        writer.add_document(id=u("3"), content=u("charlie delta echo foxtrot"))
        writer.commit()

        writer = ix.writer()
        writer.delete_by_term("id", "1")
        writer.delete_by_term("id", "2")
        writer.add_document(id=u("4"), content=u("apple bear cherry donut"))
        writer.add_document(id=u("5"), content=u("bear cherry donut eggs"))
        # Deleted in the second delete batch below
        writer.add_document(id=u("6"), content=u("delta echo foxtrot golf"))
        # Contains no term starting with "d"
        writer.add_document(id=u("7"), content=u("echo foxtrot golf hotel"))
        writer.commit(merge=False)

        writer = ix.writer()
        writer.delete_by_term("id", "3")
        writer.delete_by_term("id", "6")
        writer.add_document(id=u("8"), content=u("cherry donut eggs falafel"))
        writer.add_document(id=u("9"), content=u("donut eggs falafel grape"))
        writer.add_document(id=u("A"), content=u(" foxtrot golf hotel india"))
        writer.commit(merge=False)

        assert ix.doc_count() == 6

        with ix.searcher() as s:
            r = s.search(query.Prefix("content", u("d")), optimize=False)
            assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"]

            r = s.search(query.Prefix("content", u("d")))
            assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"]

            r = s.search(query.Prefix("content", u("d")), limit=None)
            assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"]

def test_quoted_prefix():
    qp = default.QueryParser("f", None)

    expr = r"(^|(?<=[ (]))(?P<text>\w+|[*]):"
    qp.replace_plugin(plugins.FieldsPlugin(expr))

    q = qp.parse(u('foo url:http://apple.com:8080/bar* baz'))
    assert isinstance(q, query.And)
    assert q[0] == query.Term("f", "foo")
    assert q[1] == query.Prefix("url", "http://apple.com:8080/bar")
    assert q[2] == query.Term("f", "baz")
    assert len(q) == 3

def test_too_many_prefix_positions():
    from whoosh import matching

    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i in xrange(200):
            text = u("a%s" % i)
            w.add_document(id=i, text=text)

    q = query.Prefix("text", u("a"))
    # Lower the clause limit so the 200 expanded terms exceed it
    q.TOO_MANY_CLAUSES = 100

    with ix.searcher() as s:
        m = q.matcher(s)
        assert_equal(m.__class__, matching.ListMatcher)
        assert m.supports("positions")

        items = list(m.items_as("positions"))
        assert_equal([(i, [0]) for i in xrange(200)], items)

def test_patterns():
    domain = u("aaron able acre adage aether after ago ahi aim ajax akimbo "
               "alembic all amiga amount ampere").split()
    schema = fields.Schema(word=fields.KEYWORD(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for word in domain:
            w.add_document(word=word)

    with ix.reader() as r:
        assert_equal(list(r.lexicon("word")), domain)
        assert_equal(list(r.expand_prefix("word", "al")), ["alembic", "all"])

        q = query.Prefix("word", "al")
        assert_equal(q.simplify(r).__unicode__(), "(word:alembic OR word:all)")

        q = query.Wildcard("word", "a*[ae]")
        assert_equal(q.simplify(r).__unicode__(),
                     "(word:able OR word:acre OR word:adage OR "
                     "word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "a")

        q = query.Regex("word", "am.*[ae]")
        assert_equal(q.simplify(r).__unicode__(),
                     "(word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "am")

        q = query.Regex("word", "able|ago")
        assert_equal(q.simplify(r).__unicode__(), "(word:able OR word:ago)")
        assert_equal(q._find_prefix(q.text), "")

        # Special case: ? may mean "zero occurrences"
        q = query.Regex("word", "ah?i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")

        # Special case: * may mean "zero occurrences"
        q = query.Regex("word", "ah*i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")

def parse_query(self, fieldname, qstring, boost=1.0):
    text = self.process_text(qstring)
    from whoosh import query
    return query.Prefix(fieldname, text, boost=boost)

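# Context for a parse_query() override like the one above: Whoosh's query
# parser lets a field parse its own query text when the field's
# self_parsing() returns True. PrefixField below is a hypothetical subclass
# sketching how the two methods pair up; it is not part of the code above.
from whoosh import fields, query
from whoosh.qparser import QueryParser


class PrefixField(fields.ID):
    def self_parsing(self):
        return True

    def parse_query(self, fieldname, qstring, boost=1.0):
        # Every bare term in this field becomes a prefix search
        return query.Prefix(fieldname, qstring, boost=boost)


_schema = fields.Schema(tag=PrefixField())
_qp = QueryParser("tag", _schema)
assert _qp.parse("alf") == query.Prefix("tag", "alf")
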
def prefix_query(query_string):
    # The prefix query is built the same way regardless of the
    # query string's length
    return query.Prefix(_TEXT_FIELD, query_string)

def make_prefix(self, fieldname, text):
    fieldname = fieldname or self.default_field
    text = self._analyze(fieldname, text)
    return query.Prefix(fieldname, text)

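# Why running prefix text through the field's analysis (as make_prefix does
# above via _analyze) matters: query.Prefix compares against raw indexed
# terms and performs no analysis of its own, so an un-normalized prefix can
# silently miss terms the indexing analyzer lowercased. A minimal
# self-contained sketch; the schema and document are made up for
# illustration:
from whoosh import fields, query
from whoosh.filedb.filestore import RamStorage

_schema = fields.Schema(text=fields.TEXT(stored=True))
_ix = RamStorage().create_index(_schema)
with _ix.writer() as w:
    w.add_document(text="Apple Anteater")  # indexed as "apple", "anteater"

with _ix.searcher() as s:
    # The raw, capitalized prefix matches nothing...
    assert [hit["text"] for hit in s.search(query.Prefix("text", "Ant"))] == []
    # ...while the lowercased prefix finds the document
    assert [hit["text"] for hit in
            s.search(query.Prefix("text", "ant"))] == ["Apple Anteater"]
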
import re

import pandas as pd
from whoosh import index, query


def filter_corpus(corpus_ind_dir, query_list, year_from, year_to):
    ix = index.open_dir(corpus_ind_dir)  # load the index

    with ix.searcher() as searcher:
        # Build one OR clause over the query terms/phrases and one over the
        # requested year range, then require both.
        term_list_T = []
        term_list_Y = []
        for t in query_list:
            t = re.sub(r'[^a-zA-Z0-9_ ]', '', t).lower()
            splitted = t.split()
            if len(splitted) > 1:
                term_list_T.append(query.Phrase("content", splitted))
            else:
                term_list_T.append(query.Term("content", t))
        for y in range(year_from, year_to + 1):
            term_list_Y.append(query.Term("year", str(y)))

        q1 = query.Or(term_list_T)
        q2 = query.Or(term_list_Y)
        q_f = query.And([q1, q2])
        results = searcher.search(q_f, limit=None)

        # Collect the distinct article ids of the matching sentence rows
        relevant_article_ids = []
        for r in results:
            article_id = r["id"].split('_')[0]
            if article_id not in relevant_article_ids:
                relevant_article_ids.append(article_id)

        # Re-assemble each relevant article from all of its sentence rows,
        # found with a prefix query on the "<article id>_" id pattern
        new_corpus = []
        for r_article_id in sorted(relevant_article_ids):
            article_id = r_article_id + "_"
            q = query.Prefix("id", article_id)
            x = 0
            row_data = {}
            for r in searcher.search(q, limit=None):
                if x == 0:
                    # First row: copy every stored field
                    for key in r:
                        if key == "content":
                            row_data["sentences"] = r['content']
                            x += 1
                        elif key == "id":
                            row_data["id"] = article_id[:-1]
                        else:
                            row_data[key] = r[key]
                else:
                    # Later rows: append their sentences
                    row_data["sentences"] += " " + r['content']
            new_corpus.append(row_data)

    # Put the id column first and return the corpus as CSV text
    pd_save = pd.DataFrame.from_records(new_corpus)
    cols = ['id'] + [col for col in pd_save if col != 'id']
    pd_save = pd_save[cols]
    return pd_save.to_csv(encoding='utf-8')

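# A minimal usage sketch for filter_corpus; the index directory, query terms,
# year range, and output filename are hypothetical placeholders.
if __name__ == "__main__":
    csv_text = filter_corpus("corpus_index", ["climate change", "drought"],
                             year_from=1990, year_to=2000)
    with open("filtered_corpus.csv", "w", encoding="utf-8") as f:
        f.write(csv_text)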