def getDeflate(app, dset_id, dset_json):
    deflate_level = getDeflateLevel(dset_json)
    log.debug(f"got deflate_level: {deflate_level}")
    if deflate_level is not None:
        deflate_map = app['deflate_map']
        if dset_id not in deflate_map:
            # save the deflate level so the lazy chunk writer can access it
            deflate_map[dset_id] = deflate_level
            log.debug(f"update deflate_map: {dset_id}: {deflate_level}")
    return deflate_level
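
# Illustrative usage (a minimal sketch, not invoked by this module): getDeflate is
# typically called when a dataset's chunks are first touched, so that the deflate_map
# entry is in place before the lazy chunk writer (s3sync) flushes dirty chunks.
# The names "app", "dset_id", and "dset_json" stand for objects the caller already has.
#
#     deflate_level = getDeflate(app, dset_id, dset_json)
#     # later, the s3sync task can look up the level for this dataset:
#     level = app["deflate_map"].get(dset_id)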
async def get_chunk(app, chunk_id, dset_json, bucket=None, s3path=None, s3offset=0, s3size=0, chunk_init=False):
    # if the chunk cache has too many dirty items, wait till items get flushed to S3
    MAX_WAIT_TIME = 10.0  # TBD - make this a config
    chunk_cache = app['chunk_cache']
    if chunk_init and s3offset > 0:
        log.error(f"unable to initialize chunk {chunk_id} for reference layouts")
        raise HTTPInternalServerError()

    log.debug(f"getChunk cache utilization: {chunk_cache.cacheUtilizationPercent}%, dirty_count: {chunk_cache.dirtyCount}, mem_dirty: {chunk_cache.memDirty}")

    chunk_arr = None
    dims = getChunkLayout(dset_json)
    type_json = dset_json["type"]
    dt = createDataType(type_json)
    # note - officially we should follow the order in which the filters are defined
    # in the filter_list, but since we currently have just deflate and shuffle we will
    # always apply deflate then shuffle on read, and shuffle then deflate on write
    # also note - get deflate and shuffle will update the deflate and shuffle map
    # so that the s3sync will do the right thing
    deflate_level = getDeflateLevel(dset_json)
    shuffle = isShuffle(dset_json)
    s3key = None

    if s3path:
        if not s3path.startswith("s3://"):
            # TBD - verify these at dataset creation time?
            log.error(f"unexpected s3path for getChunk: {s3path}")
            raise HTTPInternalServerError()
        path = s3path[5:]
        index = path.find('/')  # split bucket and key
        if index < 1:
            log.error(f"s3path is invalid: {s3path}")
            raise HTTPInternalServerError()
        bucket = path[:index]
        s3key = path[(index + 1):]
        log.debug(f"Using s3path bucket: {bucket} and s3key: {s3key}")
    else:
        s3key = getS3Key(chunk_id)
        log.debug(f"getChunk chunkid: {chunk_id} bucket: {bucket}")

    if chunk_id in chunk_cache:
        chunk_arr = chunk_cache[chunk_id]
    else:
        if s3path and s3size == 0:
            obj_exists = False
        else:
            obj_exists = await isStorObj(app, s3key, bucket=bucket)
        # TBD - potential race condition?
        if obj_exists:
            pending_s3_read = app["pending_s3_read"]
            if chunk_id in pending_s3_read:
                # already a read in progress, wait for it to complete
                read_start_time = pending_s3_read[chunk_id]
                log.info(f"s3 read request for {chunk_id} was requested at: {read_start_time}")
                while time.time() - read_start_time < 2.0:
                    log.debug("waiting for pending s3 read, sleeping")
                    await asyncio.sleep(1)  # sleep for sub-second?
                    if chunk_id in chunk_cache:
                        log.info(f"Chunk {chunk_id} has arrived!")
                        chunk_arr = chunk_cache[chunk_id]
                        break
                if chunk_arr is None:
                    log.warn(f"s3 read for chunk {chunk_id} timed out, initiating a new read")
            if chunk_arr is None:
                if chunk_id not in pending_s3_read:
                    pending_s3_read[chunk_id] = time.time()
                log.debug(f"Reading chunk {s3key} from S3")
                chunk_bytes = await getStorBytes(app, s3key,
                                                 shuffle=shuffle,
                                                 deflate_level=deflate_level,
                                                 offset=s3offset,
                                                 length=s3size,
                                                 bucket=bucket)
                if chunk_id in pending_s3_read:
                    # read complete - remove from pending map
                    elapsed_time = time.time() - pending_s3_read[chunk_id]
                    log.info(f"s3 read for {s3key} took {elapsed_time}")
                    del pending_s3_read[chunk_id]
                else:
                    log.warn(f"expected to find {chunk_id} in pending_s3_read map")
                chunk_arr = bytesToArray(chunk_bytes, dt, dims)
            log.debug(f"chunk size: {chunk_arr.size}")
        elif chunk_init:
            log.debug(f"Initializing chunk {chunk_id}")
            fill_value = getFillValue(dset_json)
            if fill_value:
                # need to convert list to tuple for numpy broadcast
                if isinstance(fill_value, list):
                    fill_value = tuple(fill_value)
                chunk_arr = np.empty(dims, dtype=dt, order='C')
                chunk_arr[...] = fill_value
            else:
                chunk_arr = np.zeros(dims, dtype=dt, order='C')
        else:
            log.debug(f"Chunk {chunk_id} not found")

    if chunk_arr is not None:
        # check that there's room in the cache before adding it
        if chunk_cache.memTarget - chunk_cache.memDirty < chunk_arr.size:
            # no room in the cache, wait till space is freed by the s3sync task
            wait_start = time.time()
            while chunk_cache.memTarget - chunk_cache.memDirty < chunk_arr.size:
                log.warn(f"getChunk, cache utilization: {chunk_cache.cacheUtilizationPercent}, sleeping till items are flushed")
                if time.time() - wait_start > MAX_WAIT_TIME:
                    log.error(f"unable to save updated chunk {chunk_id} to cache, returning 503 error")
                    raise HTTPServiceUnavailable()
                await asyncio.sleep(1)
        chunk_cache[chunk_id] = chunk_arr  # store in cache

    return chunk_arr
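
# Illustrative usage (a minimal sketch, not part of the service code): a request
# handler might fetch a chunk and read from it as shown below. "app", "chunk_id",
# "dset_json", and "bucket" are assumed to be supplied by the caller; the helper
# itself is hypothetical and only demonstrates the calling convention.
async def _example_read_chunk(app, chunk_id, dset_json, bucket=None):
    chunk_arr = await get_chunk(app, chunk_id, dset_json, bucket=bucket)
    if chunk_arr is None:
        # chunk does not exist in storage and chunk_init was not requested
        return None
    # get_chunk returns the cached array object itself, so copy before use
    # unless the intent is to update the chunk in place
    return chunk_arr.copy()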