def test_attr_filters(self, runner, temp_rootdir, create_test_simple_csv):
    """
    Test for command

        tiledb convert-from csv [csv_file] [uri] --attr-filters <filter name>,<filter name>,...

    Converts the simple CSV fixture with ``--attr-filters GzipFilter=9`` and
    checks that attributes ``a``, ``b``, ``c`` and ``date`` of the resulting
    array each carry exactly one filter, equal to ``tiledb.GzipFilter(9)``.
    """
    test_name, _ = create_test_simple_csv
    csv_path = os.path.join(temp_rootdir, f"{test_name}.csv")
    array_uri = os.path.join(temp_rootdir, "test_attr_filters.tdb")

    cli_args = [
        "convert-from",
        "csv",
        csv_path,
        array_uri,
        "--attr-filters",
        "GzipFilter=9",
    ]
    result = runner.invoke(root, cli_args)

    # Echo the CLI output so a failing run shows what the command reported.
    print(result.stdout)
    assert result.exit_code == 0

    with tiledb.open(array_uri) as array:
        for attr_name in ("a", "b", "c", "date"):
            attr_filters = array.schema.attr(attr_name).filters
            assert attr_filters.nfilters == 1
            assert attr_filters[0] == tiledb.GzipFilter(9)
def create_new_array(size, array_out_name, tile_size, attribute_config, compressor='gzip', compression_level=-1):
    '''
    Creates an empty dense tileDB array with a single 'genome_coordinate'
    dimension and one gzip-filtered attribute per entry of the attribute info.

    size -- number of cells; the dimension domain is (0, size - 1)
    array_out_name -- location at which the array is created
    tile_size -- requested tile extent; capped at `size`
    attribute_config -- forwarded to get_attribute_info(); every entry of the
        result must provide a 'dtype'
    compressor -- accepted for interface compatibility; only gzip is applied
    compression_level -- gzip level handed to each attribute's GzipFilter
        (previously ignored); -1 leaves the choice to TileDB's default

    Uses the module-level tdb_Context for the domain and the array creation.
    Returns None.
    '''
    # A tile may not be larger than the dimension it tiles.
    tile_size = min(size, tile_size)
    tiledb_dim = tiledb.Dim(
        name='genome_coordinate',
        domain=(0, size - 1),
        tile=tile_size,
        dtype='uint32')
    tiledb_dom = tiledb.Domain(tiledb_dim, ctx=tdb_Context)

    # generate the attribute information
    attribute_info = get_attribute_info(attribute_config)
    attribs = [
        tiledb.Attr(
            name=key,
            filters=tiledb.FilterList(
                [tiledb.GzipFilter(level=compression_level)]),
            dtype=attribute_info[key]['dtype'])
        for key in attribute_info
    ]

    tiledb_schema = tiledb.ArraySchema(
        domain=tiledb_dom,
        attrs=tuple(attribs),
        cell_order='row-major',
        tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, tiledb_schema, ctx=tdb_Context)
    print("created empty array on disk")
    gc.collect()
    return
def create_datahealtharray(self, uri):
    """Create the sparse health-metrics array at `uri`.

    Nothing is created unless `uri` ends with "DAILY_METRICS". When it does,
    the array gets a nanosecond-resolution 'date' dimension and three float64
    attributes ('midclose', 'logret', 'logret_ema'); attributes, coordinates
    and offsets are all gzip-filtered. Returns None in both cases.
    """
    if not uri.endswith("DAILY_METRICS"):
        return

    def gzip_filters():
        # Build a fresh FilterList for every schema component that needs one.
        return tiledb.FilterList([tiledb.GzipFilter(level=-1)], chunksize=512000)

    date_dim = tiledb.Dim(
        name='date',
        domain=(np.datetime64('1900-01-01'), np.datetime64('2262-01-01')),
        # NOTE(review): the tile extent is 365 *nanoseconds* -- confirm this
        # is the intended unit.
        tile=np.timedelta64(365, 'ns'),
        dtype=np.datetime64('', 'ns').dtype)

    metric_attrs = [
        tiledb.Attr(name=metric, dtype='float64', filters=gzip_filters())
        for metric in ('midclose', 'logret', 'logret_ema')
    ]

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(date_dim),
        attrs=metric_attrs,
        cell_order='row-major',
        tile_order='row-major',
        capacity=10000,
        sparse=True,
        allows_duplicates=False,
        coords_filters=gzip_filters(),
        offsets_filters=gzip_filters())
    tiledb.SparseArray.create(uri, schema)
def store_df(self, datatype, name, df, sparse=True, data_df=True):
    """Write `df` to the TileDB array identified by (`datatype`, `name`).

    datatype -- selects the array family; compared against self._RAW_DATA and
        self._HEALTH_DATA to pick the schema-creation method
    name -- combined with `datatype` by self.get_uri() to build the array URI
    df -- dataframe handed to tiledb.from_pandas
    sparse -- forwarded to tiledb.from_pandas
    data_df -- when true and the array does not exist yet, the array is first
        created with the predefined schema for `datatype`

    The write mode is 'append' when the array exists at write time and
    'ingest' otherwise.
    """
    uri = self.get_uri(datatype, name)
    array_existed = tiledb.highlevel.array_exists(uri)

    if not array_existed and data_df:
        if datatype == self._RAW_DATA:
            self.create_dataarray(uri)
        elif datatype == self._HEALTH_DATA:
            self.create_datahealtharray(uri)
        # Re-check instead of assuming creation happened: an unrecognised
        # datatype, or create_datahealtharray() skipping a URI it does not
        # handle, leaves no array behind, and appending would then fail.
        array_existed = tiledb.highlevel.array_exists(uri)

    # NOTE(review): newer TileDB-Py releases appear to spell the attribute
    # filter keyword `attr_filters` -- confirm against the installed version.
    tiledb.from_pandas(
        uri,
        df,
        sparse=sparse,
        mode='append' if array_existed else 'ingest',
        # TileDB layout names are hyphenated ('row-major'), as in the
        # schemas created elsewhere in this file.
        tile_order='row-major',
        cell_order='row-major',
        attrs_filters=tiledb.FilterList(
            [tiledb.GzipFilter(level=-1)], chunksize=512000),
        coords_filters=tiledb.FilterList(
            [tiledb.GzipFilter(level=-1)], chunksize=512000))
def create_new_array(tdb_Context, size, array_out_name, coord_tile_size, task_tile_size, attribute_config, attribute_config_file, compressor='gzip', compression_level=-1, var=False):
    '''
    Creates an empty dense 2-D tileDB array with dimensions
    'genome_coordinate' and 'task'.

    tdb_Context -- context used when building the domain
    size -- tuple (num_indices, num_tasks)
    array_out_name -- location at which the array is created
    coord_tile_size -- requested tile extent along 'genome_coordinate';
        capped at size[0]
    task_tile_size -- requested tile extent along 'task'; capped at size[1]
        but never below 1
    attribute_config, attribute_config_file -- forwarded to
        get_attribute_info(); every entry of the result must provide a 'dtype'
    compressor -- accepted for interface compatibility; only gzip is applied
    compression_level -- gzip level handed to each attribute's GzipFilter
        (previously ignored); -1 leaves the choice to TileDB's default
    var -- forwarded to every tiledb.Attr

    Returns None.
    '''
    coord_tile_size = min(size[0], coord_tile_size)
    # Keep the task tile extent at 1 or more even when size[1] is 0.
    task_tile_size = max(1, min(size[1], task_tile_size))

    # NOTE(review): both domains use the size itself as the inclusive upper
    # bound (size, not size - 1) -- confirm the extra cell is intended.
    tiledb_dim_coords = tiledb.Dim(
        name='genome_coordinate',
        domain=(0, size[0]),
        tile=coord_tile_size,
        dtype='uint32')
    tiledb_dim_tasks = tiledb.Dim(
        name='task',
        domain=(0, size[1]),
        tile=task_tile_size,
        dtype='uint32')
    tiledb_dom = tiledb.Domain(tiledb_dim_coords, tiledb_dim_tasks, ctx=tdb_Context)

    # generate the attribute information
    attribute_info = get_attribute_info(attribute_config, attribute_config_file)
    attribs = [
        tiledb.Attr(
            name=key,
            var=var,
            filters=tiledb.FilterList(
                [tiledb.GzipFilter(level=compression_level)]),
            dtype=attribute_info[key]['dtype'])
        for key in attribute_info
    ]

    tiledb_schema = tiledb.ArraySchema(
        domain=tiledb_dom,
        attrs=tuple(attribs),
        cell_order='row-major',
        tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, tiledb_schema)
    print("created empty array on disk")
    return
def create_dataarray(self, uri):
    """Create the sparse raw-data array at `uri`.

    The schema has a nanosecond-resolution 'date' dimension, eight float64
    bid/ask attributes (open, close, high, low for each side) and an int64
    'tickqty' attribute. Attributes, coordinates and offsets are all
    gzip-filtered. Returns None.
    """
    def gzip_filters():
        # Build a fresh FilterList for every schema component that needs one.
        return tiledb.FilterList([tiledb.GzipFilter(level=-1)], chunksize=512000)

    date_dim = tiledb.Dim(
        name='date',
        domain=(np.datetime64('1900-01-01'), np.datetime64('2262-01-01')),
        # NOTE(review): the tile extent is 365 *nanoseconds* -- confirm this
        # is the intended unit.
        tile=np.timedelta64(365, 'ns'),
        dtype=np.datetime64('', 'ns').dtype)
    date_domain = tiledb.Domain(date_dim)

    # (attribute name, dtype) in schema order.
    attr_specs = (
        ('bidopen', 'float64'),
        ('bidclose', 'float64'),
        ('bidhigh', 'float64'),
        ('bidlow', 'float64'),
        ('askopen', 'float64'),
        ('askclose', 'float64'),
        ('askhigh', 'float64'),
        ('asklow', 'float64'),
        ('tickqty', 'int64'),
    )
    schema_attrs = [
        tiledb.Attr(name=attr_name, dtype=attr_dtype, filters=gzip_filters())
        for attr_name, attr_dtype in attr_specs
    ]

    schema = tiledb.ArraySchema(
        domain=date_domain,
        attrs=schema_attrs,
        cell_order='row-major',
        tile_order='row-major',
        capacity=10000,
        sparse=True,
        allows_duplicates=False,
        coords_filters=gzip_filters(),
        offsets_filters=gzip_filters())
    tiledb.SparseArray.create(uri, schema)