Example #1
# Imports assumed for this snippet (they are not shown in the original extract);
# _gdal_typename is a private helper in rasterio.dtypes that maps numpy dtypes
# to GDAL type names.
import math
import os
import xml.etree.ElementTree as ET

import numpy as np
import rasterio
import tiledb
import xarray as xr
from affine import Affine
from rasterio.dtypes import _gdal_typename


def stack(_input,
          output,
          tile_x_size,
          tile_y_size,
          config=None,
          attrs=None,
          bbox=None):
    with rasterio.open(_input) as src:
        profile = src.profile
        trans = Affine.to_gdal(src.transform)
        dt = np.dtype(src.dtypes[0])  # read first band data type

    # adjust the source profile for the GDAL TileDB driver
    profile['driver'] = 'TileDB'
    profile['blockxsize'] = tile_x_size
    profile['blockysize'] = tile_y_size
    if 'tiled' in profile:
        del profile['tiled']

    arr = xr.open_rasterio(_input, chunks={'x': tile_x_size, 'y': tile_y_size})

    if bbox is None:
        w = profile['width']
        h = profile['height']
        bbox = (0, 0, w, h)
    else:
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]

    nBlocksX = math.ceil(w / (tile_x_size * 1.0))
    nBlocksY = math.ceil(h / (tile_y_size * 1.0))

    # GDAL TileDB driver writes/reads blocks so bypass rasterio
    dom = tiledb.Domain(
        tiledb.Dim(name='BANDS', domain=(0, profile['count'] - 1), tile=1),
        tiledb.Dim(name='Y',
                   domain=(0, (nBlocksY * tile_y_size) - 1),
                   tile=tile_y_size,
                   dtype=np.uint64),
        tiledb.Dim(name='X',
                   domain=(0, (nBlocksX * tile_x_size) - 1),
                   tile=tile_x_size,
                   dtype=np.uint64))

    cfg = tiledb.Config(config)
    ctx = tiledb.Ctx(config=cfg)
    schema = tiledb.ArraySchema(
        domain=dom,
        sparse=False,
        attrs=[tiledb.Attr(name="TDB_VALUES", dtype=dt)],
        ctx=ctx)

    tiledb.DenseArray.create(output, schema)
    with tiledb.DenseArray(output, 'w', ctx=ctx) as arr_output:
        # xarray dims are (band, y, x): rows take the y range, columns the x range
        arr[:, bbox[1]:bbox[3],
            bbox[0]:bbox[2]].data.to_tiledb(arr_output, storage_options=config)

    # write the GDAL metadata file from the source profile
    vfs = tiledb.VFS(ctx=ctx)
    meta_path = f"{output}/{os.path.basename(output)}.tdb.aux.xml"
    try:
        f = vfs.open(meta_path, "w")
        root = ET.Element('PAMDataset')
        geo = ET.SubElement(root, 'GeoTransform')
        geo.text = ', '.join(map(str, trans))
        meta = ET.SubElement(root, 'Metadata')
        meta.set('domain', 'IMAGE_STRUCTURE')
        t = ET.SubElement(meta, 'MDI')
        t.set('key', 'DATA_TYPE')
        t.text = _gdal_typename(dt)  # GDAL type name matching the source band dtype and the array attribute
        nbits = ET.SubElement(meta, 'MDI')
        nbits.set('key', 'NBITS')
        nbits.text = str(dt.itemsize * 8)
        xsize = ET.SubElement(meta, 'MDI')
        xsize.set('key', 'X_SIZE')
        xsize.text = str(w)
        ysize = ET.SubElement(meta, 'MDI')
        ysize.set('key', 'Y_SIZE')
        ysize.text = str(h)
        vfs.write(f, ET.tostring(root))
    finally:
        vfs.close(f)
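
The function above mirrors the source raster's profile into a dense TileDB schema and streams the pixels in through dask. A minimal usage sketch, assuming a local GeoTIFF and illustrative tile sizes, config values, and output URIs (none of these come from the original project):

stack("input.tif", "stacked_array", tile_x_size=1024, tile_y_size=1024)

# Restrict ingestion to a pixel window (xmin, ymin, xmax, ymax) and pass
# TileDB settings through `config`.
stack("input.tif", "stacked_subset", 512, 512,
      config={"sm.tile_cache_size": "100000000"},
      bbox=(0, 0, 2048, 2048))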
Example #2
from multiprocessing.pool import Pool, ThreadPool, get_context
import pdb
import gc

# graceful shutdown
import psutil
import signal
import os

# the snippet uses tiledb below; the import is assumed (not shown in the extract)
import tiledb

# config
tdb_Config = tiledb.Config({
    "sm.check_coord_dups": "false",
    "sm.check_coord_oob": "false",
    "sm.check_global_order": "false",
    "sm.num_writer_threads": "50",
    "sm.num_reader_threads": "50",
    "sm.num_async_threads": "50",
    "vfs.num_threads": "50",
    "sm.memory_budget": "5000000000"
})
tdb_Context = tiledb.Ctx(config=tdb_Config)
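
# Sanity-check sketch (not part of the original module): the context keeps the
# settings it was built with, and Ctx.config() returns them for inspection.
print(tdb_Context.config()["sm.memory_budget"])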


def init_worker():
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def kill_child_processes(parent_pid, sig=signal.SIGTERM):
    try:
        parent = psutil.Process(parent_pid)
    except psutil.NoSuchProcess:
        return
    # the extract is truncated here; a standard psutil completion is assumed:
    # forward `sig` to every child process of the parent
    for child in parent.children(recursive=True):
        child.send_signal(sig)
Example #3
def config():
    """
    Output TileDB's default configuration parameters and values.
    """
    click.echo(tiledb.Config())
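
The command above simply echoes the string form of a default tiledb.Config, one parameter per line. A short sketch of the same inspection done programmatically, filtering for the storage-manager namespace (Config.items() is TileDB-Py's iterator over parameter/value pairs):

import tiledb

for param, value in tiledb.Config().items():
    if param.startswith("sm."):
        print(param, value)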
Example #4
def ingest(args):
    if isinstance(args, dict):
        args=args_object_from_args_dict(args)
    if args.write_chunk > max_write_chunk:
        print("WARNING: you specified a write_chunk size of "+str(args.write_chunk)+
              " but the maximum supported with python serialization is "+str(max_write_chunk)+
              "; it will be reset to "+str(max_write_chunk))
        args.write_chunk=max_write_chunk

    #create a queue to write the array
    global write_queue
    write_queue=Queue(maxsize=args.max_queue_size)

    #config
    tdb_Config=tiledb.Config(tdb_config_params)
    tdb_write_Context=tiledb.Ctx(config=tdb_Config)   
    tdb_read_Context=tiledb.Ctx(config=tdb_Config)
    
    overwrite=args.overwrite
    coord_tile_size=args.coord_tile_size
    task_tile_size=args.task_tile_size
    attribute_config=args.attribute_config
    attribute_config_file=args.attribute_config_file
    updating=False

    attribute_info=get_attribute_info(args.attribute_config,args.attribute_config_file)
    tiledb_metadata=pd.read_csv(args.tiledb_metadata,header=0,sep='\t')
    num_tasks=tiledb_metadata.shape[0]
    print("num_tasks:"+str(num_tasks))
    
    print("loaded tiledb metadata")
    chrom_sizes=pd.read_csv(args.chrom_sizes,header=None,sep='\t')
    print("loaded chrom sizes")
    chrom_indices,num_indices=transform_chrom_size_to_indices(chrom_sizes)
    print("num_indices:"+str(num_indices))
    array_out_name=args.array_name
    if tiledb.object_type(array_out_name) == "array":
        if not overwrite:
            raise Exception("array: "+str(array_out_name)+" already exists; use the --overwrite flag to overwrite it. Exiting")
        else:
            print("warning: the array "+str(array_out_name)+" already exists. You provided the --overwrite flag, so it will be updated/overwritten")
            updating=True
    else:
        #create the array:
        create_new_array(tdb_Context=tdb_write_Context,
                         size=(num_indices,num_tasks-1),
                         attribute_config=attribute_config,
                         attribute_config_file=attribute_config_file,
                         array_out_name=array_out_name,
                         coord_tile_size=coord_tile_size,
                         task_tile_size=task_tile_size,
                         var=False)
        print("created new array:"+str(array_out_name))
        #create metadata array
        metadata_dict={}
        metadata_dict['tasks']=[i for i in tiledb_metadata['dataset']]
        metadata_dict['chroms']=[i for i in chrom_indices.keys()]
        metadata_dict['sizes']=[i[2] for i in list(chrom_indices.values())]
        metadata_dict['offsets']=[i[0] for i in list(chrom_indices.values())]
        num_tasks=tiledb_metadata['dataset'].shape[0]
        
        num_chroms=len(chrom_indices.keys())
        with tiledb.DenseArray(array_out_name,ctx=tdb_write_Context,mode='w') as cur_array:
            cur_array.meta['num_tasks']=num_tasks
            cur_array.meta['num_chroms']=num_chroms
            for task_index in range(num_tasks):
                cur_array.meta['_'.join(['task',str(task_index)])]=metadata_dict['tasks'][task_index]
            for chrom_index in range(num_chroms):
                cur_array.meta['_'.join(['chrom',str(chrom_index)])]=metadata_dict['chroms'][chrom_index]
                cur_array.meta['_'.join(['size',str(chrom_index)])]=metadata_dict['sizes'][chrom_index]
                cur_array.meta['_'.join(['offset',str(chrom_index)])]=metadata_dict['offsets'][chrom_index]                                
        print("created tiledb metadata")
    pool=Pool(processes=args.threads,initializer=init_worker)
    print("made pool") 
    pool_inputs=[] 
    for task_index,task_row in tiledb_metadata.iterrows():
        dataset=task_row['dataset']
        #read in filenames for bigwigs
        data_dict=open_data_for_parsing(task_row,attribute_info)
        for start_chunk_index in range(0,num_indices,args.write_chunk):
            end_chunk_index=start_chunk_index+min([num_indices,args.write_chunk])
            #convert global indices to chrom+pos indices
            chunk_chrom_coords=transform_indices_to_chrom_coords(start_chunk_index,end_chunk_index,chrom_indices)
            if chunk_chrom_coords is None:
                raise Exception("failed to tranform indices:"+str(start_chunk_index)+"-"+str(end_chunk_index)+ " to chrom coords;"+str(chrom_indices))
            for coord_set in chunk_chrom_coords:
                pool_inputs.append((task_index,data_dict,attribute_info,coord_set,args))
    pool_feed_chunk_start=0
    pool_feed_chunk_max=len(pool_inputs)
    chunks_to_process=len(pool_inputs)
    array_writer=Process(target=write_array,args=([args,updating,chunks_to_process]))
    try:
        array_writer.start()
    except Exception as e:
        raise e

    try:
        while pool_feed_chunk_start < pool_feed_chunk_max:
            pool_feed_chunk_end=min([pool_feed_chunk_start+queue_feed_chunk_size,pool_feed_chunk_max])
            #only do mapping if queue size is not exceeded & total memory consumption is not exceeded
            write_queue_size=write_queue.qsize()
            mem_used=psutil.virtual_memory().used / (10**9)
            print("mapping to pool, queue size:"+str(write_queue_size))
            print("mapping to pool, mem used:"+str(mem_used))
            #block until the write queue drains and memory use drops below the limits
            while (write_queue_size >= args.max_queue_size) or (mem_used >= args.max_mem_g):
                time.sleep(10)
                write_queue_size=write_queue.qsize()
                mem_used=psutil.virtual_memory().used / (10**9)
            print("sending to pool:"+str(pool_feed_chunk_start)+"-"+str(pool_feed_chunk_end)+"/"+str(chunks_to_process))
            pool.map(process_chunk,pool_inputs[pool_feed_chunk_start:pool_feed_chunk_end])
            pool_feed_chunk_start+=queue_feed_chunk_size
            time.sleep(60)
        pool.close()
    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        pool.terminate()
        raise
    except Exception as e:
        print(e)
        kill_child_processes(os.getpid())
        raise 
        
    #wait until we're done writing to the tiledb array
    array_writer.join()
    print("array_writer.join() is complete")
    print("shutting down pool")
    pool.join()
    print('done!') 
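
ingest() accepts either a parsed argument object or a plain dict (converted through args_object_from_args_dict). A usage sketch with the fields the function reads above; every path and value here is an illustrative placeholder, not taken from the original project:

ingest({
    "tiledb_metadata": "tasks.tsv",          # placeholder TSV with a 'dataset' column
    "chrom_sizes": "hg38.chrom.sizes",       # placeholder two-column chrom sizes file
    "array_name": "coverage_db",             # output TileDB array URI
    "overwrite": False,
    "attribute_config": "encode_pipeline",   # placeholder; must be known to get_attribute_info
    "attribute_config_file": None,
    "coord_tile_size": 10000,
    "task_tile_size": 1,
    "write_chunk": 30000000,
    "max_queue_size": 50,
    "max_mem_g": 200,
    "threads": 40,
})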
Example #5
class TileHelper(object):
    """
    The TileHelper class for convenient tiledb setup
    """
    config = tiledb.Config()
    config["vfs.s3.scheme"] = "https"
    config["vfs.s3.region"] = "us-west-1"
    config["vfs.s3.use_virtual_addressing"] = "true"
    ctx = tiledb.Ctx(config)

    def __init__(self, backend=None, tile_size=1000000, compressor='lz4'):
        if backend is None:
            self.root = os.environ.get('TILEDB_ROOT',
                                       os.path.realpath('./tiledb/'))
            if not os.path.isdir(self.root):
                try:
                    os.makedirs(self.root)
                except OSError:
                    print(
                        f"Warning: TileHelper not able to create {self.root} for backend {backend}"
                    )
        elif backend == 's3':
            self.root = 's3://sirius-tiledb/'
        self.tile_size = tile_size
        if isinstance(compressor, str):
            self.compressor = (compressor, -1)
        elif isinstance(compressor, tuple):
            self.compressor = compressor

    def create_dense_array(self, arrayID, data):
        assert isinstance(data, np.ndarray), "data should be an np.ndarray"
        tile_dims = []
        for i_dim, dim_size in enumerate(data.shape):
            name = f'd{i_dim}'
            tile = min(self.tile_size, dim_size)
            tiledim = tiledb.Dim(self.ctx,
                                 name=name,
                                 domain=(0, dim_size - 1),
                                 tile=tile)
            tile_dims.append(tiledim)
        domain = tiledb.Domain(self.ctx, *tile_dims)
        attr = tiledb.Attr(self.ctx,
                           compressor=self.compressor,
                           dtype=data.dtype)
        schema = tiledb.ArraySchema(self.ctx,
                                    domain=domain,
                                    sparse=False,
                                    attrs=[attr])
        tile_array_id = os.path.join(self.root, arrayID)
        tiledb.DenseArray.create(tile_array_id, schema)
        dense_array = tiledb.DenseArray(self.ctx, tile_array_id, mode='w')
        dense_array[:] = data
        return dense_array

    def load_dense_array(self, arrayID):
        tile_array_id = os.path.join(self.root, arrayID)
        try:
            return tiledb.DenseArray(self.ctx, tile_array_id)
        except tiledb.TileDBError as e:
            print(e)
            return np.array([])

    def remove(self, arrayID):
        tile_array_id = os.path.join(self.root, arrayID)
        tiledb.remove(self.ctx, tile_array_id)

    def ls(self):
        paths = []
        tiledb.ls(self.ctx, self.root, lambda p, l: paths.append(p))
        if self.root.startswith("s3://"):
            results = [p[len(self.root):-1] for p in paths]
        else:
            results = [os.path.basename(p) for p in paths]
        return results
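
# Usage sketch for TileHelper (illustrative array name and data; local backend).
# Note: the class above targets an older TileDB-Py API in which Dim, Attr,
# ArraySchema and DenseArray take a positional Ctx and Attr accepts `compressor`.
import numpy as np

helper = TileHelper(tile_size=1000)
data = np.random.rand(100, 100)
helper.create_dense_array("demo_array", data)
print(helper.ls())
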
def vacuum_array_metadata(uri):
    """
    Vacuum the already consolidated array metadata in an array located at uri.
    """
    config = tiledb.Config({"sm.vacuum.mode": "array_meta"})
    tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
def vacuum_fragments(uri):
    """
    Vacuum the already consolidated fragments in an array located at uri.
    """
    config = tiledb.Config({"sm.vacuum.mode": "fragments"})
    tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
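
Both helpers only clean up what an earlier consolidation left behind, so they are typically paired with tiledb.consolidate using the matching sm.consolidation.mode. A minimal end-to-end sketch (the URI is illustrative):

uri = "my_array"  # illustrative array URI

# Consolidate fragments, then vacuum the now-redundant fragment directories.
tiledb.consolidate(uri, config=tiledb.Config({"sm.consolidation.mode": "fragments"}))
vacuum_fragments(uri)

# Same pattern for array metadata.
tiledb.consolidate(uri, config=tiledb.Config({"sm.consolidation.mode": "array_meta"}))
vacuum_array_metadata(uri)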