def testPostChunkedRefDataset(self):
    """Create a dataset with an H5D_CHUNKED_REF layout that points directly
    at chunk offsets/sizes inside the snp500.h5 file stored in S3, then
    verify a point selection returns the expected record.

    Skips (returns early) when the sample bucket is not configured, when the
    snp500.json chunk-map fixture is missing, or when the S3 object itself
    is absent (404 on read).
    """
    print("testPostChunkedRefDataset", self.base_domain)
    headers = helper.getRequestHeaders(domain=self.base_domain)

    hdf5_sample_bucket = config.get("hdf5_sample_bucket")
    if not hdf5_sample_bucket:
        # fixed: message previously referenced the wrong test name
        print("hdf5_sample_bucket config not set, skipping testPostChunkedRefDataset")
        return

    s3path = "s3://" + hdf5_sample_bucket + "/data/hdf5test" + "/snp500.h5"
    SNP500_ROWS = 3207353  # row count of /dset in snp500.h5

    snp500_json = helper.getHDF5JSON("snp500.json")
    if not snp500_json:
        print("snp500.json file not found, skipping testPostChunkedRefDataset")
        return

    if "snp500.h5" not in snp500_json:
        self.fail("snp500.h5 key not found in snp500.json")

    chunk_dims = [60000, ]  # chunk layout used in snp500.h5 file
    chunk_info = snp500_json["snp500.h5"]
    dset_info = chunk_info["/dset"]
    if "byteStreams" not in dset_info:
        self.fail("expected byteStreams key in /dset info")
    byteStreams = dset_info["byteStreams"]

    # construct map of chunk index -> (file_offset, size) within the HDF5 file
    chunks = {}
    for item in byteStreams:
        chunk_key = str(item["index"])
        chunks[chunk_key] = (item["file_offset"], item["size"])

    # get domain root id
    req = helper.getEndpoint() + '/'
    rsp = requests.get(req, headers=headers)
    rspJson = json.loads(rsp.text)
    self.assertTrue("root" in rspJson)
    root_uuid = rspJson["root"]

    # define the string types needed by the compound record type
    s10_type = {"charSet": "H5T_CSET_ASCII",
                "class": "H5T_STRING",
                "length": 10,
                "strPad": "H5T_STR_NULLPAD"}
    s4_type = {"charSet": "H5T_CSET_ASCII",
               "class": "H5T_STRING",
               "length": 4,
               "strPad": "H5T_STR_NULLPAD"}

    fields = ({'name': 'date', 'type': s10_type},
              {'name': 'symbol', 'type': s4_type},
              {'name': 'sector', 'type': 'H5T_STD_I8LE'},
              {'name': 'open', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'high', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'low', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'volume', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'close', 'type': 'H5T_IEEE_F32LE'})
    datatype = {'class': 'H5T_COMPOUND', 'fields': fields}

    # create the dataset with the chunked-ref layout in creationProperties
    data = {"type": datatype, "shape": [SNP500_ROWS, ]}
    layout = {"class": 'H5D_CHUNKED_REF',
              "file_uri": s3path,
              "dims": chunk_dims,
              "chunks": chunks}
    data['creationProperties'] = {'layout': layout}

    req = self.endpoint + '/datasets'
    rsp = requests.post(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 201)
    rspJson = json.loads(rsp.text)
    dset_id = rspJson["id"]
    self.assertTrue(helper.validateId(dset_id))

    # link new dataset as 'dset'
    name = "dset"
    req = self.endpoint + "/groups/" + root_uuid + "/links/" + name
    payload = {"id": dset_id}
    rsp = requests.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)

    # do a point selection; a 404 means the S3 object isn't present, so
    # just log and skip the value checks in that case
    req = self.endpoint + "/datasets/" + dset_id + "/value"
    points = [1234567, ]
    body = {"points": points}
    rsp = requests.post(req, data=json.dumps(body), headers=headers)
    if rsp.status_code == 404:
        print("s3object: {} not found, skipping point chunk ref test".format(s3path))
    else:
        self.assertEqual(rsp.status_code, 200)
        rspJson = json.loads(rsp.text)
        self.assertTrue("value" in rspJson)
        value = rspJson["value"]
        self.assertEqual(len(value), len(points))
        # expected record values for row 1234567 of the snp500 data
        item = value[0]
        self.assertEqual(item[0], '1998.10.22')
        self.assertEqual(item[1], 'MHFI')
        self.assertEqual(item[2], 3)
def testChunkedRefIndirectDataset(self):
    """Create a dataset with an H5D_CHUNKED_REF_INDIRECT layout whose chunk
    offsets/sizes live in a separate chunk-info dataset (the "chunk table"),
    then verify a query selection against the snp500 data returns the
    expected AAPL rows.

    Skips (returns early) when the sample bucket is not configured, when the
    snp500.json chunk-map fixture is missing, or when the S3 object itself
    is absent (404 on read).
    """
    # fixed: print previously said "testChunkedRefIndirectDatasetQuery"
    print("testChunkedRefIndirectDataset", self.base_domain)
    headers = helper.getRequestHeaders(domain=self.base_domain)

    hdf5_sample_bucket = config.get("hdf5_sample_bucket")
    if not hdf5_sample_bucket:
        print(
            "hdf5_sample_bucket config not set, skipping testChunkedRefIndirectDataset"
        )
        return

    s3path = "s3://" + hdf5_sample_bucket + "/data/hdf5test" + "/snp500.h5"
    SNP500_ROWS = 3207353  # row count of /dset in snp500.h5

    snp500_json = helper.getHDF5JSON("snp500.json")
    if not snp500_json:
        # fixed: message previously referenced the wrong test name
        print("snp500.json file not found, skipping testChunkedRefIndirectDataset")
        return

    if "snp500.h5" not in snp500_json:
        self.fail("snp500.h5 key not found in snp500.json")

    chunk_dims = [60000, ]  # chunk layout used in snp500.h5 file
    num_chunks = (SNP500_ROWS // chunk_dims[0]) + 1
    chunk_info = snp500_json["snp500.h5"]
    dset_info = chunk_info["/dset"]
    if "byteStreams" not in dset_info:
        self.fail("expected byteStreams key in /dset info")
    byteStreams = dset_info["byteStreams"]
    self.assertEqual(len(byteStreams), num_chunks)

    # fill the chunk-info list with (file_offset, size) tuples, placed at
    # each chunk's index position
    chunkinfo_data = [(0, 0)] * num_chunks
    for i in range(num_chunks):
        item = byteStreams[i]
        chunkinfo_data[item["index"]] = (item["file_offset"], item["size"])

    # get domain root id
    req = helper.getEndpoint() + '/'
    rsp = requests.get(req, headers=headers)
    rspJson = json.loads(rsp.text)
    self.assertTrue("root" in rspJson)
    root_uuid = rspJson["root"]

    # create a dataset (the chunk table) to store the chunk locations
    fields = ({'name': 'offset', 'type': 'H5T_STD_I64LE'},
              {'name': 'size', 'type': 'H5T_STD_I32LE'})
    chunkinfo_type = {'class': 'H5T_COMPOUND', 'fields': fields}
    chunkinfo_dims = [num_chunks, ]
    payload = {'type': chunkinfo_type, 'shape': chunkinfo_dims}
    req = self.endpoint + "/datasets"
    rsp = requests.post(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)  # create dataset
    rspJson = json.loads(rsp.text)
    chunkinfo_uuid = rspJson['id']
    self.assertTrue(helper.validateId(chunkinfo_uuid))

    # link new dataset as 'chunks'
    name = "chunks"
    req = self.endpoint + "/groups/" + root_uuid + "/links/" + name
    payload = {"id": chunkinfo_uuid}
    rsp = requests.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)

    # write the offsets/sizes to the chunkinfo dataset
    payload = {'value': chunkinfo_data}
    req = self.endpoint + "/datasets/" + chunkinfo_uuid + "/value"
    rsp = requests.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 200)  # write value

    # define the string types needed by the compound record type
    s10_type = {"charSet": "H5T_CSET_ASCII",
                "class": "H5T_STRING",
                "length": 10,
                "strPad": "H5T_STR_NULLPAD"}
    s4_type = {"charSet": "H5T_CSET_ASCII",
               "class": "H5T_STRING",
               "length": 4,
               "strPad": "H5T_STR_NULLPAD"}

    fields = ({'name': 'date', 'type': s10_type},
              {'name': 'symbol', 'type': s4_type},
              {'name': 'sector', 'type': 'H5T_STD_I8LE'},
              {'name': 'open', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'high', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'low', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'volume', 'type': 'H5T_IEEE_F32LE'},
              {'name': 'close', 'type': 'H5T_IEEE_F32LE'})
    datatype = {'class': 'H5T_COMPOUND', 'fields': fields}

    # create the dataset, referencing the chunk table by uuid
    data = {"type": datatype, "shape": [SNP500_ROWS, ]}
    layout = {"class": 'H5D_CHUNKED_REF_INDIRECT',
              "file_uri": s3path,
              "dims": chunk_dims,
              "chunk_table": chunkinfo_uuid}
    data['creationProperties'] = {'layout': layout}

    req = self.endpoint + '/datasets'
    rsp = requests.post(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 201)
    rspJson = json.loads(rsp.text)
    dset_id = rspJson["id"]
    self.assertTrue(helper.validateId(dset_id))

    # link new dataset as 'dset'
    name = "dset"
    req = self.endpoint + "/groups/" + root_uuid + "/links/" + name
    payload = {"id": dset_id}
    rsp = requests.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)

    # run a query selection for all AAPL rows
    req = self.endpoint + "/datasets/" + dset_id + "/value"
    params = {'query': "symbol == b'AAPL'"}
    params["nonstrict"] = 1  # enable SN to invoke lambda func
    rsp = requests.get(req, params=params, headers=headers)
    if rsp.status_code == 404:
        print(
            "s3object: {} not found, skipping hyperslab read chunk reference indirect test"
            .format(s3path))
        return

    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("value" in rspJson)
    self.assertTrue("index" in rspJson)
    # expected AAPL row count and first matching record in the snp500 data
    readData = rspJson["value"]
    self.assertEqual(len(readData), 8813)
    item = readData[0]
    self.assertEqual(item[0], "1980.12.12")
    self.assertEqual(item[1], "AAPL")
    indices = rspJson["index"]
    self.assertEqual(len(indices), 8813)
    self.assertEqual(indices[0], 128912)
def testPostContiguousDataset(self):
    """Create datasets with H5D_CONTIGUOUS_REF layouts that reference two
    contiguous datasets inside the tall.h5 sample file in S3
    (/g1/g1.1/dset1.1.2 and /g2/dset2.2), then verify point selection
    reads on each.

    Skips (returns early) when the sample bucket is not configured, when the
    tall.json fixture is missing, or when the S3 object itself is absent
    (404 on read).
    """
    print("testPostContiguousDataset", self.base_domain)
    headers = helper.getRequestHeaders(domain=self.base_domain)

    hdf5_sample_bucket = config.get("hdf5_sample_bucket")
    if not hdf5_sample_bucket:
        print("hdf5_sample_bucket config not set, skipping testPostContiguousDataset")
        return

    tall_json = helper.getHDF5JSON("tall.json")
    if not tall_json:
        print("tall.json file not found, skipping testPostContiguousDataset")
        return

    if "tall.h5" not in tall_json:
        self.fail("tall.h5 key not found in tall.json")

    chunk_info = tall_json["tall.h5"]
    if "/g1/g1.1/dset1.1.2" not in chunk_info:
        self.fail("expected /g1/g1.1/dset1.1.2 in chunk info")
    dset112_info = chunk_info["/g1/g1.1/dset1.1.2"]
    if "byteStreams" not in dset112_info:
        self.fail("expected byteStreams key in dset1.1.2 info")
    byteStreams = dset112_info["byteStreams"]
    # should be just one element for this contiguous dataset
    # (bug fix: was assertTrue(len(byteStreams), 1), a no-op assertion
    # where 1 was treated as the failure message)
    self.assertEqual(len(byteStreams), 1)
    byteStream = byteStreams[0]
    dset112_offset = byteStream["file_offset"]
    dset112_size = byteStream["size"]
    self.assertEqual(dset112_size, 80)

    if "/g2/dset2.2" not in chunk_info:
        self.fail("expected /g2/dset2.2 in chunk info")
    dset22_info = chunk_info["/g2/dset2.2"]
    if "byteStreams" not in dset22_info:
        self.fail("expected byteStreams key in dset2.2 info")
    byteStreams = dset22_info["byteStreams"]
    self.assertEqual(len(byteStreams), 1)  # same assertTrue -> assertEqual fix
    byteStream = byteStreams[0]
    dset22_offset = byteStream["file_offset"]
    dset22_size = byteStream["size"]
    self.assertEqual(dset22_size, 60)

    # get domain root id
    req = helper.getEndpoint() + '/'
    rsp = requests.get(req, headers=headers)
    rspJson = json.loads(rsp.text)
    self.assertTrue("root" in rspJson)
    root_uuid = rspJson["root"]

    # create dataset for /g1/g1.1/dset1.1.2
    s3path = "s3://" + hdf5_sample_bucket + "/data/hdf5test" + "/tall.h5"
    data = {"type": 'H5T_STD_I32BE', "shape": 20}
    layout = {"class": 'H5D_CONTIGUOUS_REF',
              "file_uri": s3path,
              "offset": dset112_offset,
              "size": dset112_size}
    data['creationProperties'] = {'layout': layout}
    req = self.endpoint + '/datasets'
    rsp = requests.post(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 201)
    rspJson = json.loads(rsp.text)
    dset112_id = rspJson["id"]
    self.assertTrue(helper.validateId(dset112_id))

    # link new dataset as 'dset112'
    name = "dset112"
    req = self.endpoint + "/groups/" + root_uuid + "/links/" + name
    payload = {"id": dset112_id}
    rsp = requests.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)

    # create dataset for /g2/dset2.2
    data = {"type": 'H5T_IEEE_F32BE', "shape": [3, 5]}
    layout = {"class": 'H5D_CONTIGUOUS_REF',
              "file_uri": s3path,
              "offset": dset22_offset,
              "size": dset22_size}
    data['creationProperties'] = {'layout': layout}
    req = self.endpoint + '/datasets'
    rsp = requests.post(req, data=json.dumps(data), headers=headers)
    self.assertEqual(rsp.status_code, 201)
    rspJson = json.loads(rsp.text)
    dset22_id = rspJson["id"]
    self.assertTrue(helper.validateId(dset22_id))

    # link new dataset as 'dset22'
    name = "dset22"
    req = self.endpoint + "/groups/" + root_uuid + "/links/" + name
    payload = {"id": dset22_id}
    rsp = requests.put(req, data=json.dumps(payload), headers=headers)
    self.assertEqual(rsp.status_code, 201)

    # do a point selection read on dset112
    req = self.endpoint + "/datasets/" + dset112_id + "/value"
    points = [2, 3, 5, 7, 11, 13, 17, 19]
    body = {"points": points}
    rsp = requests.post(req, data=json.dumps(body), headers=headers)
    if rsp.status_code == 404:
        print("s3object: {} not found, skipping point read chunk reference contiguous test".format(s3path))
        return

    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("value" in rspJson)
    ret_value = rspJson["value"]
    self.assertEqual(len(ret_value), len(points))
    # get back the points since the dataset values are in the range 0-20
    self.assertEqual(ret_value, points)

    # do a point selection read on dset22
    req = self.endpoint + "/datasets/" + dset22_id + "/value"
    points = [(0, 0), (1, 1), (2, 2)]
    body = {"points": points}
    rsp = requests.post(req, data=json.dumps(body), headers=headers)
    self.assertEqual(rsp.status_code, 200)
    rspJson = json.loads(rsp.text)
    self.assertTrue("value" in rspJson)
    ret_value = rspJson["value"]
    self.assertEqual(len(ret_value), len(points))