def test_attr_filters(self, runner, temp_rootdir, create_test_simple_csv):
        """
        Check that ``--attr-filters`` is applied to every attribute.

        The command line exercised through the CLI runner is

            convert-from csv [csv_file] [uri] --attr-filters GzipFilter=9

        and afterwards each of the attributes "a", "b", "c" and "date" of the
        resulting array must carry exactly one filter, equal to
        ``tiledb.GzipFilter(9)``.
        """
        test_name, _ = create_test_simple_csv
        input_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_attr_filters.tdb")

        cli_args = [
            "convert-from",
            "csv",
            input_path,
            uri,
            "--attr-filters",
            "GzipFilter=9",
        ]
        result = runner.invoke(root, cli_args)

        # Echo the command output so a failing run is easier to diagnose.
        print(result.stdout)
        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            for attr_name in ("a", "b", "c", "date"):
                attr_filters = array.schema.attr(attr_name).filters
                assert attr_filters.nfilters == 1
                assert attr_filters[0] == tiledb.GzipFilter(9)
Beispiel #2
0
def create_new_array(size,
                     array_out_name,
                     tile_size,
                     attribute_config,
                     compressor='gzip',
                     compression_level=-1):
    '''
    Creates an empty one-dimensional dense tileDB array on disk.

    The array has a single uint32 dimension named 'genome_coordinate' that
    spans 0 .. size - 1, and one gzip-compressed attribute per entry returned
    by get_attribute_info(attribute_config).

    size: number of cells along the dimension.
    array_out_name: location passed to tiledb.DenseArray.create.
    tile_size: requested tile extent; clamped so it never exceeds size.
    attribute_config: forwarded unchanged to get_attribute_info.
    compressor: accepted for interface compatibility; only gzip is applied,
        whatever value is passed.
    compression_level: level handed to the gzip filter of every attribute
        (-1 is also the default of this function).

    Returns None.
    '''

    tile_size = min(size, tile_size)
    tiledb_dim = tiledb.Dim(name='genome_coordinate',
                            domain=(0, size - 1),
                            tile=tile_size,
                            dtype='uint32')
    tiledb_dom = tiledb.Domain(tiledb_dim, ctx=tdb_Context)

    # generate the attribute information
    attribute_info = get_attribute_info(attribute_config)
    # Each attribute gets its own FilterList; the gzip level now follows the
    # compression_level argument instead of being silently dropped.
    attribs = [
        tiledb.Attr(name=key,
                    filters=tiledb.FilterList(
                        [tiledb.GzipFilter(level=compression_level)]),
                    dtype=attribute_info[key]['dtype'])
        for key in attribute_info
    ]
    tiledb_schema = tiledb.ArraySchema(domain=tiledb_dom,
                                       attrs=tuple(attribs),
                                       cell_order='row-major',
                                       tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, tiledb_schema, ctx=tdb_Context)
    print("created empty array on disk")
    gc.collect()
    return
Beispiel #3
0
    def create_datahealtharray(self, uri):
        """
        Create the sparse "daily metrics" array at ``uri``.

        Nothing is created unless ``uri`` ends with "DAILY_METRICS". The array
        has one nanosecond-resolution datetime dimension named 'date' and the
        float64 attributes 'midclose', 'logret' and 'logret_ema'. Attributes,
        coordinates and offsets all use gzip (level -1) with a chunk size of
        512000.
        """
        if not uri.endswith("DAILY_METRICS"):
            return

        def gzip_filters():
            # A fresh FilterList per use, exactly as if written out inline.
            return tiledb.FilterList([tiledb.GzipFilter(level=-1)],
                                     chunksize=512000)

        # NOTE(review): the tile extent is 365 nanoseconds, not 365 days --
        # confirm this is intended.
        date_dim = tiledb.Dim(
            name='date',
            domain=(np.datetime64('1900-01-01'), np.datetime64('2262-01-01')),
            tile=np.timedelta64(365, 'ns'),
            dtype=np.datetime64('', 'ns').dtype,
        )
        date_domain = tiledb.Domain(date_dim)

        metric_attrs = [
            tiledb.Attr(name=metric, dtype='float64', filters=gzip_filters())
            for metric in ('midclose', 'logret', 'logret_ema')
        ]

        schema = tiledb.ArraySchema(
            domain=date_domain,
            attrs=metric_attrs,
            cell_order='row-major',
            tile_order='row-major',
            capacity=10000,
            sparse=True,
            allows_duplicates=False,
            coords_filters=gzip_filters(),
            offsets_filters=gzip_filters(),
        )

        tiledb.SparseArray.create(uri, schema)
Beispiel #4
0
    def store_df(self, datatype, name, df, sparse=True, data_df=True):
        """
        Write ``df`` to the TileDB array identified by ``datatype``/``name``.

        The target URI comes from ``self.get_uri(datatype, name)``. When the
        array is missing and ``data_df`` is true, the schema is created first
        via ``create_dataarray`` (raw data) or ``create_datahealtharray``
        (health data). The frame is then appended if the array exists at that
        point, otherwise ``tiledb.from_pandas`` ingests it and creates the
        array itself.

        datatype: compared against ``self._RAW_DATA`` / ``self._HEALTH_DATA``
            to pick the schema creator.
        name: forwarded to ``self.get_uri``.
        df: frame handed to ``tiledb.from_pandas``.
        sparse: forwarded to ``tiledb.from_pandas``.
        data_df: when false, no schema is pre-created for a missing array.
        """
        uri = self.get_uri(datatype, name)
        array_exists = tiledb.highlevel.array_exists(uri)

        if not array_exists and data_df:
            if datatype == self._RAW_DATA:
                self.create_dataarray(uri)
            elif datatype == self._HEALTH_DATA:
                self.create_datahealtharray(uri)

            # Ask again instead of assuming success: an unknown datatype, or a
            # creator that decided not to build anything, leaves no array
            # behind, and appending to a missing array would fail.
            array_exists = tiledb.highlevel.array_exists(uri)

        def gzip_filters():
            return tiledb.FilterList([tiledb.GzipFilter(level=-1)],
                                     chunksize=512000)

        # Layout names use the hyphenated spelling ('row-major'), matching the
        # schemas created elsewhere for these arrays.
        tiledb.from_pandas(
            uri,
            df,
            sparse=sparse,
            mode='append' if array_exists else 'ingest',
            tile_order='row-major',
            cell_order='row-major',
            attrs_filters=gzip_filters(),
            coords_filters=gzip_filters())
Beispiel #5
0
def create_new_array(tdb_Context,
                     size,
                     array_out_name,
                     coord_tile_size,
                     task_tile_size,
                     attribute_config,
                     attribute_config_file,
                     compressor='gzip',
                     compression_level=-1,
                     var=False):
    '''
    Creates an empty two-dimensional dense tileDB array on disk.

    The dimensions are 'genome_coordinate' and 'task' (both uint32); one
    gzip-compressed attribute is added per entry returned by
    get_attribute_info(attribute_config, attribute_config_file).

    tdb_Context: tileDB context used when building the domain.
    size: tuple (num_indices, num_tasks).
    array_out_name: location passed to tiledb.DenseArray.create.
    coord_tile_size: requested tile extent along 'genome_coordinate';
        clamped so it never exceeds size[0].
    task_tile_size: requested tile extent along 'task'; clamped to size[1]
        but never below 1.
    attribute_config, attribute_config_file: forwarded unchanged to
        get_attribute_info.
    compressor: accepted for interface compatibility; only gzip is applied,
        whatever value is passed.
    compression_level: level handed to the gzip filter of every attribute
        (-1 is also the default of this function).
    var: forwarded to every tiledb.Attr.

    Returns None.
    '''
    coord_tile_size = min(size[0], coord_tile_size)
    task_tile_size = max([1, min(size[1], task_tile_size)])

    # NOTE(review): the upper bounds are size[0] / size[1] themselves rather
    # than size - 1 -- confirm the extra trailing index is intended.
    tiledb_dim_coords = tiledb.Dim(
        name='genome_coordinate',
        domain=(0, size[0]),
        tile=coord_tile_size,
        dtype='uint32')
    tiledb_dim_tasks = tiledb.Dim(
        name='task',
        domain=(0, size[1]),
        tile=task_tile_size,
        dtype='uint32')
    tiledb_dom = tiledb.Domain(tiledb_dim_coords, tiledb_dim_tasks,
                               ctx=tdb_Context)

    # generate the attribute information
    attribute_info = get_attribute_info(attribute_config,
                                        attribute_config_file)
    # Each attribute gets its own FilterList; the gzip level now follows the
    # compression_level argument instead of being silently dropped.
    attribs = [
        tiledb.Attr(
            name=key,
            var=var,
            filters=tiledb.FilterList(
                [tiledb.GzipFilter(level=compression_level)]),
            dtype=attribute_info[key]['dtype'])
        for key in attribute_info
    ]

    tiledb_schema = tiledb.ArraySchema(
        domain=tiledb_dom,
        attrs=tuple(attribs),
        cell_order='row-major',
        tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, tiledb_schema)
    print("created empty array on disk")
    return
Beispiel #6
0
    def create_dataarray(self, uri):
        """
        Create the sparse raw-data array at ``uri``.

        The array has one nanosecond-resolution datetime dimension named
        'date', eight float64 bid/ask attributes (open, close, high, low for
        each side) and an int64 'tickqty' attribute. Attributes, coordinates
        and offsets all use gzip (level -1) with a chunk size of 512000.
        """
        def gzip_filters():
            # A fresh FilterList per use, exactly as if written out inline.
            return tiledb.FilterList([tiledb.GzipFilter(level=-1)],
                                     chunksize=512000)

        # (attribute name, dtype) in the order they appear in the schema.
        columns = (
            ('bidopen', 'float64'),
            ('bidclose', 'float64'),
            ('bidhigh', 'float64'),
            ('bidlow', 'float64'),
            ('askopen', 'float64'),
            ('askclose', 'float64'),
            ('askhigh', 'float64'),
            ('asklow', 'float64'),
            ('tickqty', 'int64'),
        )

        # NOTE(review): the tile extent is 365 nanoseconds, not 365 days --
        # confirm this is intended.
        date_dim = tiledb.Dim(
            name='date',
            domain=(np.datetime64('1900-01-01'), np.datetime64('2262-01-01')),
            tile=np.timedelta64(365, 'ns'),
            dtype=np.datetime64('', 'ns').dtype,
        )
        date_domain = tiledb.Domain(date_dim)

        column_attrs = [
            tiledb.Attr(name=col_name, dtype=col_dtype, filters=gzip_filters())
            for col_name, col_dtype in columns
        ]

        schema = tiledb.ArraySchema(
            domain=date_domain,
            attrs=column_attrs,
            cell_order='row-major',
            tile_order='row-major',
            capacity=10000,
            sparse=True,
            allows_duplicates=False,
            coords_filters=gzip_filters(),
            offsets_filters=gzip_filters(),
        )

        tiledb.SparseArray.create(uri, schema)