def test_add_sortable():
    """Columns can be added retroactively to existing fields with add_sortable."""
    storage = RamStorage()
    schema = fields.Schema(chapter=fields.ID(stored=True), price=fields.NUMERIC)
    ix = storage.create_index(schema)

    with ix.writer() as writer:
        writer.add_document(chapter=u("alfa"), price=100)
        writer.add_document(chapter=u("bravo"), price=200)
        writer.add_document(chapter=u("charlie"), price=300)
        writer.add_document(chapter=u("delta"), price=400)

    with ix.writer() as writer:
        writer.add_document(chapter=u("bravo"), price=500)
        writer.add_document(chapter=u("alfa"), price=600)
        writer.add_document(chapter=u("delta"), price=100)
        writer.add_document(chapter=u("charlie"), price=200)
        # Keep the two segments separate so add_sortable must handle both.
        writer.merge = False

    # Before add_sortable: no column files exist for either field.
    with ix.reader() as reader:
        assert not reader.has_column("chapter")
        assert not reader.has_column("price")

    with ix.writer() as writer:
        sorting.add_sortable(writer, "chapter", sorting.StoredFieldFacet("chapter"))
        sorting.add_sortable(writer, "price", sorting.FieldFacet("price"))
        writer.schema.test = 100  # NOTE(review): stray schema attribute — presumably checks schema mutation is harmless

    # After add_sortable: both columns are present and readable.
    with ix.reader() as reader:
        assert reader.has_column("chapter")
        assert reader.has_column("price")

        chapter_col = reader.column_reader("chapter")
        price_col = reader.column_reader("price")
        assert chapter_col[0] == "alfa"
        assert price_col[0] == 100
def test_groupby_phrase():
    """Grouping works both via a stored ID field and a StoredFieldFacet."""
    domain = {
        "Alan Ball": "Tel Aviv",
        "Alan Charles": "San Francisco",
        "Alan Darwin": "London",
        "Alan Eames": "Paris",
    }

    schema = fields.Schema(
        name=fields.TEXT(stored=True),
        city=fields.TEXT(stored=True),
        city_g=fields.ID(stored=True),
    )
    ix = RamStorage().create_index(schema)
    with ix.writer() as writer:
        for person, place in domain.items():
            writer.add_document(name=u(person), city=u(place), city_g=u(place))

    with ix.searcher() as searcher:
        q = query.Term("name", "alan")

        # Group by the stored ID field directly.
        results = searcher.search(q, groupedby="city_g")
        group_keys = sorted(str(key) for key in results.groups().keys())
        assert_equal(group_keys, ["London", "Paris", "San Francisco", "Tel Aviv"])

        # Group via a StoredFieldFacet over the analyzed city field.
        facet = sorting.StoredFieldFacet("city")
        results = searcher.search(q, groupedby=facet)
        group_keys = sorted(str(key) for key in results.groups().keys())
        assert_equal(group_keys, ["London", "Paris", "San Francisco", "Tel Aviv"])
def search(self, query, limit=10, ranking=CUSTOM):
    """Run *query* against the index and return a ranked list of Documents.

    Args:
        query: raw query string to parse and execute.
        limit: maximum number of documents to return.
        ranking: ranking method; must be a key of ``self.scorers_dict``.

    Returns:
        A list of ``Document`` objects, best match first.

    Raises:
        ValueError: if *ranking* is not a recognised ranking method.
    """
    logger.info(
        "Received search request: Query: %s | Limit: %d | Ranking: %s",
        query, limit, ranking)
    try:
        scoring_method = self.scorers_dict[ranking]
    except KeyError:
        logger.error("Invalid ranking: %s", ranking)
        # Bug fix: the original passed the format string and the argument as
        # separate ValueError args, so "%s" was never interpolated.
        raise ValueError("Ranking must be one of these: %s"
                         % ', '.join(self.rankings))

    docs = []
    with self.ix.searcher(weighting=scoring_method) as s:
        if ranking == SearchEngine.CUSTOM:
            q = self.qp_custom.parse(query)
            # Over-fetch so the PageRank-based re-ranking below has a
            # reasonable candidate pool to reorder.
            results = s.search(q, limit=max(limit, 100))
            if not results.is_empty():
                max_pagerank = math.log10(
                    max(r.fields()["pagerank"] for r in results) + 1)
                result_list = []
                for result in results:
                    result_fields = result.fields()
                    # Guard against division by zero when every candidate
                    # has pagerank == 0 (log10(0 + 1) == 0).
                    if max_pagerank > 0:
                        pagerank_normalised = math.log10(
                            result_fields["pagerank"] + 1) / max_pagerank
                    else:
                        pagerank_normalised = 0.0
                    # Blend text relevance with link-based importance.
                    result.score = (0.6 * result.score
                                    + 0.4 * pagerank_normalised)
                    result_list.append(result)
                result_list.sort(key=lambda x: x.score, reverse=True)
                for result in result_list[:limit]:
                    docs.append(Document(**result.fields()))
            return docs
        else:
            facet = None
            reverse = False
            q = self.qp.parse(query)
            if ranking == SearchEngine.PAGERANK:
                # Sort purely by the stored pagerank value, highest first.
                reverse = True
                facet = sorting.StoredFieldFacet('pagerank')
            results = s.search(q, limit=limit, sortedby=facet,
                               reverse=reverse)
            logger.info("\tMatched docs: %d", len(results))
            logger.info("\tScored docs: %d", results.scored_length())
            for result in results:
                docs.append(Document(**result.fields()))
            return docs
def test_nested_children():
    """NestedChildren maps album (parent) matches to their track (child) docs."""
    schema = fields.Schema(
        t=fields.ID(stored=True),
        track=fields.NUMERIC(stored=True),
        album_name=fields.TEXT(stored=True),
        song_name=fields.TEXT(stored=True),
    )
    ix = RamStorage().create_index(schema)

    albums = [
        ("alfa bravo charlie",
         ["delta echo foxtrot", "golf hotel india", "juliet kilo lima"]),
        ("mike november oskar",
         ["papa quebec romeo", "sierra tango ultra", "victor whiskey xray"]),
        ("yankee zulu one",
         ["two three four", "five six seven", "eight nine ten"]),
    ]
    with ix.writer() as writer:
        for album_name, songs in albums:
            with writer.group():
                writer.add_document(t=u("album"), album_name=u(album_name))
                for number, song in enumerate(songs, 1):
                    writer.add_document(t=u("track"), track=number,
                                        song_name=u(song))

    with ix.searcher() as searcher:
        parents = query.Term("t", "album")
        november = query.Term("album_name", "november")

        # Matching every album yields every track document.
        hits = searcher.search(query.NestedChildren(parents, parents),
                               limit=None)
        assert len(hits) == 9
        assert [str(hit["t"]) for hit in hits] == ["track"] * 9

        # Matching one album yields exactly its three tracks, in order.
        child_query = query.NestedChildren(parents, november)
        assert list(child_query.docs(searcher)) == [5, 6, 7]
        hits = searcher.search(child_query, limit=None)
        assert len(hits) == 3
        assert [str(hit["song_name"]) for hit in hits] == [
            "papa quebec romeo", "sierra tango ultra", "victor whiskey xray"
        ]

        # Child results can be sorted by a stored-field facet.
        zulu_children = query.NestedChildren(
            parents, query.Term("album_name", "zulu"))
        facet = sorting.StoredFieldFacet("song_name")
        hits = searcher.search(zulu_children, sortedby=facet)
        assert [hit["track"] for hit in hits] == [3, 2, 1]
if reindex or not index.exists_in(dirname): tags = [] for _ in xrange(tagcount): tag = u"".join( random.choice(string.ascii_lowercase) for _ in xrange(5)) tags.append(tag) ix = index.create_in(dirname, schema) t = now() with ix.writer() as w: for i in xrange(doccount): doc = u" ".join(random.sample(tags, random.randint(10, 20))) w.add_document(tags=doc) if not i % 10000: print i print now() - t ix = index.open_dir(dirname) with ix.searcher() as s: tags = list(s.lexicon("tags")) facet = sorting.FieldFacet("tags", allow_overlap=True) qtag = random.choice(tags) print "tag=", qtag q = query.Term("tags", qtag) r = s.search(q, groupedby={"tags": facet}) print r.runtime facet = sorting.StoredFieldFacet("tags", allow_overlap=True) r = s.search(q, groupedby={"tags": facet}) print r.runtime