def __setupIndexes(self, chunk_size):
    '''
    Sets up RediSearch indexes for chromosomes and genes.

    Parameters:
        chunk_size (int): The chunk size to be used for Redis batch processing.

    Returns:
        redis.commands.search.Search.BatchIndexer: A batch processor for the chromosome index.
        redis.commands.search.Search.BatchIndexer: A batch processor for the gene index.
    '''
    # create the chromosome index
    chromosome_fields = [
        TextField('name'),
        NumericField('length'),
        TextField('genus'),
        TextField('species'),
    ]
    chromosome_definition = IndexDefinition(prefix=['chromosome:'])
    chromosome_indexer = \
        self.__makeOrGetIndex(
            CHROMOSOME_INDEX_NAME,
            chromosome_fields,
            chromosome_definition,
            chunk_size,
        )
    # check if any non-RediSearch chromosome keys exist, and drop if necessary
    self.__checkChromosomeKeys()
    # create the gene index
    gene_fields = [
        TextField('chromosome'),
        TextField('name'),
        NumericField('fmin'),
        NumericField('fmax'),
        TextField('family'),
        NumericField('strand'),
        NumericField('index', sortable=True),
    ]
    gene_definition = IndexDefinition(prefix=['gene:'])
    gene_indexer = \
        self.__makeOrGetIndex(
            GENE_INDEX_NAME,
            gene_fields,
            gene_definition,
            chunk_size,
        )
    # set the schema version and compatible versions
    self.redis_connection.set(VERSION_KEY, redis_loader.__schema_version__)
    self.redis_connection.delete(COMPATIBLE_KEY)
    self.redis_connection.sadd(
        COMPATIBLE_KEY, *redis_loader.__compatible_schema_versions__)
    return chromosome_indexer, gene_indexer

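# __makeOrGetIndex and __checkChromosomeKeys are private helpers not shown
# above. Below is a minimal, hypothetical sketch of what the indexer helper
# might look like, assuming redis-py's ft() search client; the helper name and
# the create-if-missing behavior are assumptions, not the project's actual code.
import redis

def __makeOrGetIndex(self, index_name, fields, definition, chunk_size):
    search = self.redis_connection.ft(index_name)
    try:
        # reuse the index if it already exists
        search.info()
    except redis.ResponseError:
        # the index does not exist yet, so create it
        search.create_index(fields, definition=definition)
    # return a batch processor that flushes every chunk_size documents
    return search.batch_indexer(chunk_size=chunk_size)
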
def test_no_index(client):
    client.ft().create_index((
        TextField("field"),
        TextField("text", no_index=True, sortable=True),
        NumericField("numeric", no_index=True, sortable=True),
        GeoField("geo", no_index=True, sortable=True),
        TagField("tag", no_index=True, sortable=True),
    ))

    client.ft().add_document(
        "doc1", field="aaa", text="1", numeric="1", geo="1,1", tag="1"
    )
    client.ft().add_document(
        "doc2", field="aab", text="2", numeric="2", geo="2,2", tag="2"
    )
    waitForIndex(client, "idx")

    res = client.ft().search(Query("@text:aa*"))
    assert 0 == res.total

    res = client.ft().search(Query("@field:aa*"))
    assert 2 == res.total

    res = client.ft().search(Query("*").sort_by("text", asc=False))
    assert 2 == res.total
    assert "doc2" == res.docs[0].id

    res = client.ft().search(Query("*").sort_by("text", asc=True))
    assert "doc1" == res.docs[0].id

    res = client.ft().search(Query("*").sort_by("numeric", asc=True))
    assert "doc1" == res.docs[0].id

    res = client.ft().search(Query("*").sort_by("geo", asc=True))
    assert "doc1" == res.docs[0].id

    res = client.ft().search(Query("*").sort_by("tag", asc=True))
    assert "doc1" == res.docs[0].id

    # Ensure an exception is raised for non-indexable, non-sortable fields
    with pytest.raises(Exception):
        TextField("name", no_index=True, sortable=False)
    with pytest.raises(Exception):
        NumericField("name", no_index=True, sortable=False)
    with pytest.raises(Exception):
        GeoField("name", no_index=True, sortable=False)
    with pytest.raises(Exception):
        TagField("name", no_index=True, sortable=False)

def createIndex(client, num_docs=100, definition=None):
    try:
        client.create_index(
            (TextField("play", weight=5.0), TextField("txt"), NumericField("chapter")),
            definition=definition,
        )
    except redis.ResponseError:
        # the index already exists; drop it and rebuild from scratch
        client.dropindex(delete_documents=True)
        return createIndex(client, num_docs=num_docs, definition=definition)

    chapters = {}
    bzfp = TextIOWrapper(bz2.BZ2File(WILL_PLAY_TEXT), encoding="utf8")

    r = csv.reader(bzfp, delimiter=";")
    for n, line in enumerate(r):
        play, chapter, _, text = line[1], line[2], line[4], line[5]

        key = f"{play}:{chapter}".lower()
        d = chapters.setdefault(key, {})
        d["play"] = play
        d["txt"] = d.get("txt", "") + " " + text
        d["chapter"] = int(chapter or 0)
        if len(chapters) == num_docs:
            break

    indexer = client.batch_indexer(chunk_size=50)
    assert isinstance(indexer, Search.BatchIndexer)
    assert 50 == indexer.chunk_size

    for key, doc in chapters.items():
        indexer.add_document(key, **doc)
    indexer.commit()

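# Hedged usage sketch (a hypothetical test, not part of the suite above):
# builds a small index with createIndex and runs a query against it, assuming
# the same `client` fixture, Query class, and waitForIndex helper used in the
# other tests here, and that WILL_PLAY_TEXT points at the Shakespeare corpus.
def test_create_index_usage(client):
    createIndex(client.ft(), num_docs=10)
    waitForIndex(client, "idx")
    # "henry" appears in several play titles, and the play field has weight 5.0
    res = client.ft().search(Query("henry"))
    assert res.total > 0
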
def test_create_json_with_alias(client):
    """
    Create a definition with IndexType.JSON as the index type (ON JSON) with
    two aliased fields, and use the JSON client to test it.
    """
    definition = IndexDefinition(prefix=["king:"], index_type=IndexType.JSON)
    client.ft().create_index(
        (TextField("$.name", as_name="name"), NumericField("$.num", as_name="num")),
        definition=definition,
    )

    client.json().set("king:1", Path.rootPath(), {"name": "henry", "num": 42})
    client.json().set("king:2", Path.rootPath(), {"name": "james", "num": 3.14})

    res = client.ft().search("@name:henry")
    assert res.docs[0].id == "king:1"
    assert res.docs[0].json == '{"name":"henry","num":42}'
    assert res.total == 1

    res = client.ft().search("@num:[0 10]")
    assert res.docs[0].id == "king:2"
    assert res.docs[0].json == '{"name":"james","num":3.14}'
    assert res.total == 1

    # Searching by the raw JSON path returns an error because the path
    # contains special characters; the alias should be used instead
    with pytest.raises(Exception):
        client.ft().search("@$.name:henry")

def test_search_return_fields(client):
    res = client.json().set(
        "doc:1",
        Path.rootPath(),
        {"t": "riceratops", "t2": "telmatosaurus", "n": 9072, "flt": 97.2},
    )
    assert res

    # create an index over the JSON documents
    definition = IndexDefinition(index_type=IndexType.JSON)
    SCHEMA = (
        TextField("$.t"),
        NumericField("$.flt"),
    )
    client.ft().create_index(SCHEMA, definition=definition)
    waitForIndex(client, "idx")

    total = client.ft().search(Query("*").return_field("$.t", as_field="txt")).docs
    assert 1 == len(total)
    assert "doc:1" == total[0].id
    assert "riceratops" == total[0].txt

    total = client.ft().search(Query("*").return_field("$.t2", as_field="txt")).docs
    assert 1 == len(total)
    assert "doc:1" == total[0].id
    assert "telmatosaurus" == total[0].txt

def test_aggregations_apply(client):
    client.ft().create_index((
        TextField("PrimaryKey", sortable=True),
        NumericField("CreatedDateTimeUTC", sortable=True),
    ))

    client.ft().client.hset(
        "doc1",
        mapping={"PrimaryKey": "9::362330", "CreatedDateTimeUTC": "637387878524969984"},
    )
    client.ft().client.hset(
        "doc2",
        mapping={"PrimaryKey": "9::362329", "CreatedDateTimeUTC": "637387875859270016"},
    )

    req = aggregations.AggregateRequest("*").apply(
        CreatedDateTimeUTC="@CreatedDateTimeUTC * 10"
    )
    res = client.ft().aggregate(req)
    assert res.rows[0] == ["CreatedDateTimeUTC", "6373878785249699840"]
    assert res.rows[1] == ["CreatedDateTimeUTC", "6373878758592700416"]

def import_brewery_geo(r, rsclient):
    # create the brewery RediSearch index
    ftidxfields = [
        TextField('name', weight=5.0),
        TextField('address'),
        TextField('city'),
        TextField('state'),
        TextField('country'),
        NumericField('id', sortable=True),
        GeoField('location'),
    ]
    rsclient.create_index([*ftidxfields])

    with open(brewerygeofile) as geofile:
        geo = csv.reader(geofile)
        for row in geo:
            if geo.line_num == 1:
                # skip the header line
                continue

            # use the brewery id to generate the brewery key added earlier
            brewery_key = "{}:{}".format(brewery, row[1])
            # get all the data from the brewery hash
            binfo = r.hgetall(brewery_key)
            if not binfo:
                print("\tERROR: Missing info for {}, skipping geo import".format(
                    brewery_key))
                continue

            # build the document; RediSearch geo fields expect "lon,lat"
            ftaddfields = {
                'name': binfo[b'name'].decode(),
                'address': binfo[b'address1'].decode(),
                'city': binfo[b'city'].decode(),
                'state': binfo[b'state'].decode(),
                'country': binfo[b'country'].decode(),
                'id': row[1],
                'location': "{},{}".format(row[3], row[2]),
            }
            # add the brewery document to the index
            try:
                rsclient.add_document(
                    "brewery:{}".format(row[1]),
                    score=1.0,
                    replace=True,
                    partial=True,
                    **ftaddfields,
                )
            except Exception as e:
                print("\tERROR: Failed to add document for {}: {}".format(
                    brewery_key, e))
                continue

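# Hedged example of querying the geo index built above, assuming the
# redisearch-py client API (Client, Query, GeoFilter) that import_brewery_geo
# itself appears to use; the index name and coordinates are illustrative.
from redisearch import Client, GeoFilter, Query

def find_breweries_near(lon, lat, radius_km=10):
    rsclient = Client('breweryIdx')
    # GeoFilter takes the field name, longitude, latitude, radius, and unit
    q = Query('*').add_filter(GeoFilter('location', lon, lat, radius_km, unit='km'))
    return rsclient.search(q).docs
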
def test_sort_by(client):
    client.ft().create_index((TextField("txt"), NumericField("num", sortable=True)))

    client.ft().add_document("doc1", txt="foo bar", num=1)
    client.ft().add_document("doc2", txt="foo baz", num=2)
    client.ft().add_document("doc3", txt="foo qux", num=3)

    # Test sort in both directions
    q1 = Query("foo").sort_by("num", asc=True).no_content()
    q2 = Query("foo").sort_by("num", asc=False).no_content()
    res1, res2 = client.ft().search(q1), client.ft().search(q2)

    assert 3 == res1.total
    assert "doc1" == res1.docs[0].id
    assert "doc2" == res1.docs[1].id
    assert "doc3" == res1.docs[2].id
    assert 3 == res2.total
    assert "doc1" == res2.docs[2].id
    assert "doc2" == res2.docs[1].id
    assert "doc3" == res2.docs[0].id

async def test_filters(modclient: redis.Redis):
    await modclient.ft().create_index(
        (TextField("txt"), NumericField("num"), GeoField("loc"))
    )
    await modclient.ft().add_document(
        "doc1", txt="foo bar", num=3.141, loc="-0.441,51.458"
    )
    await modclient.ft().add_document("doc2", txt="foo baz", num=2, loc="-0.1,51.2")
    await waitForIndex(modclient, "idx")

    # Test numeric filter
    q1 = Query("foo").add_filter(NumericFilter("num", 0, 2)).no_content()
    q2 = (
        Query("foo")
        .add_filter(NumericFilter("num", 2, NumericFilter.INF, minExclusive=True))
        .no_content()
    )
    res1, res2 = await modclient.ft().search(q1), await modclient.ft().search(q2)

    assert 1 == res1.total
    assert 1 == res2.total
    assert "doc2" == res1.docs[0].id
    assert "doc1" == res2.docs[0].id

    # Test geo filter
    q1 = Query("foo").add_filter(GeoFilter("loc", -0.44, 51.45, 10)).no_content()
    q2 = Query("foo").add_filter(GeoFilter("loc", -0.44, 51.45, 100)).no_content()
    res1, res2 = await modclient.ft().search(q1), await modclient.ft().search(q2)

    assert 1 == res1.total
    assert 2 == res2.total
    assert "doc1" == res1.docs[0].id

    # Sort the results, since order may change after an RDB reload
    res = [res2.docs[0].id, res2.docs[1].id]
    res.sort()
    assert ["doc1", "doc2"] == res

def test_aggregations_filter(client):
    client.ft().create_index((
        TextField("name", sortable=True),
        NumericField("age", sortable=True),
    ))

    client.ft().client.hset("doc1", mapping={"name": "bar", "age": "25"})
    client.ft().client.hset("doc2", mapping={"name": "foo", "age": "19"})

    req = aggregations.AggregateRequest("*").filter("@name=='foo' && @age < 20")
    res = client.ft().aggregate(req)
    assert len(res.rows) == 1
    assert res.rows[0] == ["name", "foo", "age", "19"]

    req = aggregations.AggregateRequest("*").filter("@age > 15").sort_by("@age")
    res = client.ft().aggregate(req)
    assert len(res.rows) == 2
    assert res.rows[0] == ["age", "19"]
    assert res.rows[1] == ["age", "25"]

def test_fields_as_name(client):
    # create the index
    SCHEMA = (
        TextField("$.name", sortable=True, as_name="name"),
        NumericField("$.age", as_name="just_a_number"),
    )
    definition = IndexDefinition(index_type=IndexType.JSON)
    client.ft().create_index(SCHEMA, definition=definition)

    # insert JSON data
    res = client.json().set("doc:1", Path.rootPath(), {"name": "Jon", "age": 25})
    assert res

    total = client.ft().search(
        Query("Jon").return_fields("name", "just_a_number")
    ).docs
    assert 1 == len(total)
    assert "doc:1" == total[0].id
    assert "Jon" == total[0].name
    assert "25" == total[0].just_a_number

def test_aggregations_groupby(client):
    # Creating the index definition and schema
    client.ft().create_index((
        NumericField("random_num"),
        TextField("title"),
        TextField("body"),
        TextField("parent"),
    ))

    # Indexing a document
    client.ft().add_document(
        "search",
        title="RediSearch",
        body="RediSearch implements a search engine on top of Redis",
        parent="redis",
        random_num=10,
    )
    client.ft().add_document(
        "ai",
        title="RedisAI",
        body="RedisAI executes Deep Learning/Machine Learning models and manages their data.",  # noqa
        parent="redis",
        random_num=3,
    )
    client.ft().add_document(
        "json",
        title="RedisJson",
        body="RedisJSON implements ECMA-404 The JSON Data Interchange Standard as a native data type.",  # noqa
        parent="redis",
        random_num=8,
    )

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.count()
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "3"

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.count_distinct("@title")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "3"

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.count_distinctish("@title")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "3"

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.sum("@random_num")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "21"  # 10+8+3

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.min("@random_num")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "3"  # min(10,8,3)

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.max("@random_num")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "10"  # max(10,8,3)

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.avg("@random_num")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "7"  # (10+3+8)/3

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.stddev("random_num")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "3.60555127546"

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.quantile("@random_num", 0.5)
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == "8"  # median of 3,8,10

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.tolist("@title")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[3] == ["RediSearch", "RedisAI", "RedisJson"]

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.first_value("@title").alias("first")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res == ["parent", "redis", "first", "RediSearch"]

    req = aggregations.AggregateRequest("redis").group_by(
        "@parent", reducers.random_sample("@title", 2).alias("random")
    )
    res = client.ft().aggregate(req).rows[0]
    assert res[1] == "redis"
    assert res[2] == "random"
    assert len(res[3]) == 2
    assert res[3][0] in ["RediSearch", "RedisAI", "RedisJson"]

def ftadd_beers(r, rsclient):
    # create the beer index
    ftidxfields = [
        TextField('name', weight=5.0),
        TextField('brewery'),
        NumericField('breweryid', sortable=True),
        TextField('category'),
        NumericField('categoryid'),
        TextField('style'),
        NumericField('styleid'),
        TextField('description'),
        NumericField('abv', sortable=True),
        NumericField('ibu', sortable=True),
        TagField('favorite'),
    ]
    rsclient.create_index([*ftidxfields])

    header = []
    dontadd = 0
    with open(beerfile) as csvfile:
        beers = csv.reader(csvfile)
        for row in beers:
            docid = ''
            docscore = 1.0
            ftaddfields = {}
            if beers.line_num == 1:
                header = row
                continue
            for idx, field in enumerate(row):
                if idx == 0:
                    docid = "{}:{}".format(beer, field)
                    continue
                # idx 1 is the brewery id; look up the brewery name in its hash
                if idx == 1:
                    if field == "":
                        # something is wrong with the csv, skip this line
                        print("\tEJECTING: {}".format(row))
                        dontadd = 1
                        break
                    bkey = "{}:{}".format(brewery, field)
                    ftaddfields['brewery'] = r.hget(bkey, 'name')
                    ftaddfields['breweryid'] = field
                # idx 2 is the beer name
                elif idx == 2:
                    ftaddfields['name'] = field
                # idx 3 is the category id
                elif idx == 3:
                    catname = 'None'
                    if int(field) != -1:
                        # get the category key and hget the name of the category
                        ckey = "{}:{}".format(category, field)
                        catname = r.hget(ckey, 'cat_name')
                    ftaddfields['category'] = catname
                    ftaddfields['categoryid'] = field
                # idx 4 is the style id
                elif idx == 4:
                    stylename = 'None'
                    if int(field) != -1:
                        skey = "{}:{}".format(style, field)
                        stylename = r.hget(skey, 'style_name')
                    ftaddfields['style'] = stylename
                    ftaddfields['styleid'] = field
                # idx 5 is ABV
                elif idx == 5:
                    ftaddfields['abv'] = field
                    # update the document score based on ABV
                    docscore = get_beer_doc_score(field)
                # idx 6 is IBU
                elif idx == 6:
                    ftaddfields['ibu'] = field
            if dontadd:
                dontadd = 0
                continue
            # add the beer document
            rsclient.add_document(docid, score=docscore, **ftaddfields)

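# Hedged example of searching the beer index built above, assuming the same
# redisearch-py client API (Client, Query, NumericFilter) used by ftadd_beers;
# the index name, query term, and threshold are illustrative.
from redisearch import Client, NumericFilter, Query

def find_hoppy_beers(min_ibu=60):
    rsclient = Client('beerIdx')
    # full-text match on name/description, filtered and sorted by bitterness
    q = Query('ipa').add_filter(NumericFilter('ibu', min_ibu, NumericFilter.INF))
    q = q.sort_by('ibu', asc=False)
    return rsclient.search(q).docs
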