def test_default_reader(resources_dir, filename, expected_shape, expected_dims, expected_chunksize, expected_task_count):
    # Get file
    f = resources_dir / filename

    # Read file
    img = DefaultReader(f)

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        assert img.dims == expected_dims
        assert img.metadata
        assert img.dask_data.shape == expected_shape
        assert img.dask_data.chunksize == expected_chunksize

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check computed type is numpy array, computed shape is expected shape, and task count is expected
    with Profiler() as prof:
        assert isinstance(img.data, np.ndarray)
        assert img.data.shape == expected_shape
        assert len(prof.results) == expected_task_count

    # Check that there are no open file pointers after retrieval
    assert str(f) not in [f.path for f in proc.open_files()]
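# Aside (not part of the original test suite): a minimal, self-contained sketch of the
# dask.diagnostics.Profiler behavior the assertions above rely on. Building or inspecting
# a lazy dask array does not execute any tasks, so `prof.results` stays empty until
# something actually computes; each recorded entry then carries fields such as `worker_id`.
import dask.array as da
import numpy as np
from dask.diagnostics import Profiler


def profiler_records_only_computed_tasks():
    arr = da.ones((4, 4), chunks=(2, 2))

    with Profiler() as prof:
        lazy = arr + 1                  # graph construction only, nothing runs
        assert len(prof.results) == 0

        result = lazy.compute()         # tasks execute here and are recorded
        assert isinstance(result, np.ndarray)
        assert len(prof.results) > 0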
def test_aicsimage_serialize(
    resources_dir,
    tmpdir,
    filename,
    expected_shape,
    expected_metadata_type,
    expected_task_count,
):
    """
    Test that the entire AICSImage object can be serialized - a requirement to
    distribute on dask clusters.

    https://distributed.dask.org/en/latest/serialization.html
    """
    # Get file
    f = resources_dir / filename

    # Read file
    img = AICSImage(f)

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        assert img.shape == expected_shape
        assert isinstance(img.metadata, expected_metadata_type)

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]

    # Serialize object
    serialized = pickle.dumps(img)

    # Reload
    img = pickle.loads(serialized)

    # Check computed type is numpy array, computed shape is expected shape, and task count is expected
    with Profiler() as prof:
        assert isinstance(img.data, np.ndarray)
        assert img.shape == expected_shape
        assert img.data.shape == expected_shape
        assert isinstance(img.metadata, expected_metadata_type)
        assert len(prof.results) == expected_task_count

    # Check that there are no open file pointers after retrieval
    assert str(f) not in [f.path for f in proc.open_files()]
def tpot(use_dask=True):
    # TODO: Add some documentation...
    # TODO: Investigate why tpot crashes when using Dask (probably a RAM problem).
    if use_dask:
        client = Client("tcp://192.168.1.94:8786")
        print(client)

    tpot_reg = TPOTRegressor(generations=TPOT_GENERATIONS,
                             population_size=TPOT_POPULATION_SIZE,
                             random_state=SEED,
                             cv=CV,
                             use_dask=use_dask,
                             verbosity=2,
                             memory="auto")

    df = pd.read_csv("elo/data/augmented_train.csv")
    print(df.sample(5))

    # TODO: Find a better way to impute inf and missing values.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(df.median())

    X = df.drop(FEATS_EXCLUDED, axis=1, errors='ignore').values
    y = df.loc[:, "target"].values

    if use_dask:
        with ProgressBar() as pbar, Profiler() as prof:
            tpot_reg.fit(X, y)
    else:
        tpot_reg.fit(X, y)

    export_path = str(
        Path('elo/data/tpot_few_generations_augmented_dataset.py').absolute())
    tpot_reg.export(export_path)

    return tpot_reg
def test_support_for_ndarray(arr):
    # Check basics
    with Profiler() as prof:
        actual_reader = AICSImage.determine_reader(arr)
        assert actual_reader == readers.ArrayLikeReader

        # Check that basic details don't require task computation
        assert len(prof.results) == 0
def test_ome_tiff_reader(resources_dir, filename, expected_shape, expected_dims, select_scene, expected_chunksize, expected_task_count):
    # Get file
    f = resources_dir / filename

    # Read file
    img = OmeTiffReader(f, S=select_scene)

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        # Check that OME Metadata matches the dask data array shape and dims order
        dim_size_getters = {
            Dimensions.Scene: img.size_s,
            Dimensions.Time: img.size_t,
            Dimensions.Channel: img.size_c,
            Dimensions.SpatialZ: img.size_z,
            Dimensions.SpatialY: img.size_y,
            Dimensions.SpatialX: img.size_x,
        }
        for d, getter in dim_size_getters.items():
            if d in expected_dims:
                assert getter() == img.dask_data.shape[img.dims.index(d)]

        assert img.dims == expected_dims
        assert img.is_ome()
        assert img.metadata
        assert img.dask_data.shape == expected_shape
        assert img.dask_data.chunksize == expected_chunksize

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check computed type is numpy array, computed shape is expected shape, and task count is expected
    with Profiler() as prof:
        assert isinstance(img.data, np.ndarray)
        assert img.data.shape == expected_shape
        assert len(prof.results) == expected_task_count

    # Check that there are no open file pointers after retrieval
    assert str(f) not in [f.path for f in proc.open_files()]
def test_default_shape_expansion(data, expected):
    # Check basics
    with Profiler() as prof:
        img = AICSImage(data=data)
        assert img.dask_data.shape == expected
        assert img.shape == expected

        # Check that basic details don't require task computation
        assert len(prof.results) == 0
def test_arraylike_reader(arr, expected_shape, expected_dims, expected_chunksize, expected_task_count):
    # Init
    reader = ArrayLikeReader(arr)

    # Check basics
    with Profiler() as prof:
        assert reader.dims == expected_dims
        assert reader.metadata is None
        assert reader.dask_data.shape == expected_shape
        assert reader.dask_data.chunksize == expected_chunksize

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check computed type is numpy array, computed shape is expected shape, and task count is expected
    with Profiler() as prof:
        assert isinstance(reader.data, np.ndarray)
        assert reader.data.shape == expected_shape
        assert len(prof.results) == expected_task_count
def main():
    global sky
    global dirty
    global psf

    list_schedule = []
    list_compute = []
    list_total = []
    list_load = []

    start_time1 = time.time()
    sky_npy, sky = load_data(os.path.split(os.getcwd())[0] + '/sky.npy')
    dirty_npy, dirty = load_data(os.path.split(os.getcwd())[0] + '/dirty.npy')
    psf_npy, psf = load_data(os.path.split(os.getcwd())[0] + '/psf.npy')
    end_time1 = time.time()

    start_time2 = time.time()
    scheduling()
    end_time2 = time.time()

    pbar = ProgressBar()
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
        start_time3 = time.time()
        hub.compute()
        end_time3 = time.time()

    # pbar.register()
    # quad.compute()
    # pbar.unregister()

    with PrintKeys():
        hub.compute()

    print("\n" + "Profiling results:")
    print(prof.results[0])
    print("\n" + "Memory usage is reported in MB and the CPU value is the CPU usage in %")
    print(rprof.results)
    print("\n" + "Cache profiling results:")
    print(cprof.results[0])

    visualize([prof, rprof, cprof])

    list_load.append(end_time1 - start_time1)
    list_schedule.append(end_time2 - start_time2)
    list_compute.append(end_time3 - start_time3)
    list_total.append(end_time3 - start_time1)

    print("\n" + "Timings for analysis")
    print('load time: {}'.format(round(sum(list_load) / len(list_load), 4)))
    print('scheduling time: {}'.format(round(sum(list_schedule) / len(list_schedule), 4)))
    print('compute time: {}'.format(round(sum(list_compute) / len(list_compute), 4)))
    print('total time: {}'.format(round(sum(list_total) / len(list_total), 4)))
def finalSort(data):
    with pd.HDFStore(data + 'final/final.h5') as store:
        keys = store.keys()

    with ProgressBar(), Profiler() as prof:
        with pd.HDFStore(data + 'final/final-sorted.h5', complevel=9, complib='blosc') as outstore:
            for key in keys:
                logging.info("Sorting %s" % key)
                df = dd.read_hdf(data + 'final/final.h5', key)
                sortdf = df.compute().sort_values('count', ascending=False)
                outstore.append(key, sortdf)

    logging.info("Done sorting")
def test_dims_setting(expected_starting_dims, set_dims, expected_ending_dims):
    # Read file
    img = ArrayLikeReader(da.ones((2, 2, 2)))

    # Check basics
    with Profiler() as prof:
        assert img.dims == expected_starting_dims

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check no tasks happen during dims setting
    with Profiler() as prof:
        img.dims = set_dims

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check no tasks happen during dims getting
    with Profiler() as prof:
        assert img.dims == expected_ending_dims

        # Check that basic details don't require task computation
        assert len(prof.results) == 0
def uncompress_to_hdf5():
    print('Writing to hdf5 file after loading raw data in RAM.')
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to hdf5 file without compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_uncompressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None)
        print(f'time to save the array to hdf5 without compression: {time.time() - t}')

    visualize([prof, rprof, cprof], out_file_path)

    # write to hdf5 file with gzip compression
    out_filepath = 'data/out.hdf5'
    os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_compressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None, compression="gzip")
        print(f'time to save the array to hdf5 with compression: {time.time() - t}')

    visualize([prof, rprof, cprof], out_file_path)
def test_dims_setting(resources_dir, expected_starting_dims, set_dims, expected_ending_dims):
    # Get file
    f = resources_dir / "example.png"

    # Read file
    img = DefaultReader(f)

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        assert img.dims == expected_starting_dims

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check no tasks happen during dims setting
    with Profiler() as prof:
        img.dims = set_dims

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check no tasks happen during dims getting
    with Profiler() as prof:
        assert img.dims == expected_ending_dims

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after retrieval
    assert str(f) not in [f.path for f in proc.open_files()]
def test_num_workers_config(scheduler):
    # Regression test for issue #4082
    f = delayed(pure=False)(time.sleep)

    # Be generous with the initial sleep times, as processes have been observed
    # to take >0.5s to spin up
    num_workers = 3
    a = [f(1.0) for i in range(num_workers)]

    with dask.config.set(num_workers=num_workers, chunksize=1), Profiler() as prof:
        compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}
    assert len(workers) == num_workers
def test_num_workers_config(scheduler):
    pytest.importorskip("cloudpickle")
    # Regression test for issue #4082
    f = delayed(pure=False)(time.sleep)

    # Be generous with the initial sleep times, as processes have been observed
    # to take >0.5s to spin up
    a = [f(1.0), f(1.0), f(1.0), f(0.1)]
    num_workers = 3

    with dask.config.set(num_workers=num_workers), Profiler() as prof:
        compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}
    assert len(workers) == num_workers
def onthefly_to_nps():
    print('Writing to npy stack file without loading raw data in RAM.')
    out_dir = 'data/out_3_numpy'
    out_file_path = "outputs/write_npy_stack.html"

    # write to numpy stack
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        write_to_npy_stack(out_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - t}')

    visualize([prof, rprof, cprof], out_file_path)
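# Aside: `write_to_npy_stack` is a project-specific helper defined elsewhere and is not shown
# in these excerpts. When the goal is simply to persist a dask array as a stack of .npy files,
# dask ships a built-in pair of functions; the sketch below is an assumed stand-in for
# illustration, not that project's actual helper.
import dask.array as da


def npy_stack_roundtrip_sketch(out_dir='data/out_sketch_numpy'):
    arr = da.ones((8, 8, 8), chunks=(4, 4, 4))

    # write each block along axis 0 as a .npy file, plus an info file describing the layout
    da.to_npy_stack(out_dir, arr, axis=0)

    # lazily read the stack back as a dask array
    reloaded = da.from_npy_stack(out_dir)
    assert reloaded.shape == arr.shape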
def test_known_dims(data, dims, expected_shape):
    # Check basics
    with Profiler() as prof:
        img = AICSImage(data, known_dims=dims)
        assert img.data.shape == expected_shape
        assert img.size_x == expected_shape[5]
        assert img.size_y == expected_shape[4]
        assert img.size_z == expected_shape[3]
        assert img.size_c == expected_shape[2]
        assert img.size_t == expected_shape[1]
        assert img.size_s == expected_shape[0]
        assert img.size(dims) == data.shape

        # Due to reshape and transpose there will be 2 tasks in the graph
        assert len(prof.results) == 2
def test_num_workers_config(scheduler):
    # Regression test for issue #4082
    @delayed
    def f(x):
        time.sleep(0.5)
        return x

    a = [f(i) for i in range(5)]
    num_workers = 3

    with dask.config.set(num_workers=num_workers), Profiler() as prof:
        a = compute(*a, scheduler=scheduler)

    workers = {i.worker_id for i in prof.results}
    assert len(workers) == num_workers
def test_typing(filename, expected_reader, resources_dir):
    # Get filepath
    f = resources_dir / filename

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        actual_reader = AICSImage.determine_reader(f)
        assert actual_reader == expected_reader

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]
def test_file_passed_was_directory(resources_dir):
    # Get filepath
    f = resources_dir

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        with pytest.raises(IsADirectoryError):
            AICSImage(resources_dir)

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]
def test_large_imread_dask(resources_dir, filename, expected_shape, expected_task_count):
    # Get filepath
    f = resources_dir / filename

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        img = imread_dask(f)
        assert img.shape == expected_shape
        assert len(prof.results) == expected_task_count

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]
def execute(self, wf, client):
    if not wf.processes:
        return {}

    dsk = wf.convertGraph()

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
        result = client.get(dsk[0], dsk[1])

    msg.logMessage('result:', result, level=msg.DEBUG)

    path = user_config_dir('xicam/profile.html')
    visualize([prof, rprof, cprof], show=False, file_path=path)
    msg.logMessage(f'Profile saved: {path}')

    wf.lastresult = result

    return result
def test_physical_pixel_size(resources_dir, filename, expected_sizes):
    # Get filepath
    f = resources_dir / filename

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        img = AICSImage(f)
        assert img.get_physical_pixel_size() == expected_sizes

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]
def test_imread(resources_dir, filename, expected_shape):
    # Get filepath
    f = resources_dir / filename

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        img = imread(f)
        assert img.shape == expected_shape

        # Reshape and transpose are required so there should be two tasks in the graph
        assert len(prof.results) == 2

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]
def test_metadata(resources_dir, filename, expected_metadata_type):
    # Get filepath
    f = resources_dir / filename

    # Check that there are no open file pointers after init
    proc = Process()
    assert str(f) not in [f.path for f in proc.open_files()]

    # Check basics
    with Profiler() as prof:
        img = AICSImage(f)
        assert isinstance(img.metadata, expected_metadata_type)

        # Check that basic details don't require task computation
        assert len(prof.results) == 0

    # Check that there are no open file pointers after basics
    assert str(f) not in [f.path for f in proc.open_files()]
def test_force_dims(data_shape, dims, expected):
    # Check basics
    with Profiler() as prof:
        img = AICSImage(data=da.zeros(data_shape))
        img._reader._dims = dims
        assert img.data.shape == expected
        assert data_shape == img.get_image_data(out_orientation=dims).shape
        assert img.size_x == expected[5]
        assert img.size_y == expected[4]
        assert img.size_z == expected[3]
        assert img.size_c == expected[2]
        assert img.size_t == expected[1]
        assert img.size_s == expected[0]
        assert img.size(dims) == data_shape

        # Two operations are happening
        # First, img.data is called, so two tasks (reshape and transpose) run
        # Then get_image_data is called and two more reshape and transpose tasks run
        assert len(prof.results) == 4
def uncompress_to_npy():
    print('Writing to numpy file after loading raw data in RAM.')
    out_filepath = 'data/out_1.npy'
    diagnostics_filepath = "outputs/load_raw_write_npy_file.html"
    raw_arr = uncompress()

    # write to numpy file
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)

    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        np.save(out_filepath, raw_arr)
        print(f'time to save the array to numpy file: {time.time() - t}')

    visualize([prof, rprof, cprof], diagnostics_filepath)
def _execute_graph(self, *writes):
    # Set up Profilers and Progress Bars
    with ExitStack() as stack:
        profilers = []

        if can_profile:
            from dask.diagnostics import (Profiler, CacheProfiler,
                                          ResourceProfiler, visualize)
            profilers.append(stack.enter_context(Profiler()))
            profilers.append(stack.enter_context(CacheProfiler()))
            profilers.append(stack.enter_context(ResourceProfiler()))

        if sys.stdout.isatty() and not self.args.boring:
            from dask.diagnostics import ProgressBar
            stack.enter_context(ProgressBar())

        dask.compute(*writes, scheduler='single-threaded')

    logger.info("Averaging Complete")

    if can_profile:
        visualize(profilers)
def uncompress_to_nps():
    print('Writing to numpy stack after loading raw data in RAM.')

    # load data in RAM
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to numpy stack
    out_dir = 'data/out_numpy'
    out_file_path = "outputs/load_raw_write_npy_stack.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        write_to_npy_stack(out_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - t}')

    visualize([prof, rprof, cprof], out_file_path)
def test_daread(data_dir, img, expected_shape, expected_chunksize, expected_dims):
    # Read the data into a dask array
    data, dims = daread(data_dir / img)

    # Do basic checking of shape, chunksize, and dims
    assert data.shape == expected_shape
    assert data.chunksize == expected_chunksize
    assert dims == expected_dims

    # Check that when a single plane is selected, only two tasks run
    getitem_ops = []
    for dim in dims:
        if dim not in ["Y", "X"]:
            getitem_ops.append(0)
        else:
            getitem_ops.append(slice(None, None, None))

    # Run through profiler
    with Profiler() as prof:
        assert isinstance(data[tuple(getitem_ops)].compute(), np.ndarray)
        assert len(prof.results) == 2
def onthefly_to_hdf5():
    print('Writing to hdf5 file without loading raw data in RAM.')

    # write to hdf5 file with gzip compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/write_hdf5.html"

    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=(1400, 1400, 350), compression="gzip")
        print(f'time to save the array to hdf5 with compression: {time.time() - t}')

    visualize([prof, rprof, cprof], out_file_path)
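# Aside: the `nbytes` metric passed to CacheProfiler in the snippets above is imported
# elsewhere in those scripts; the dask documentation uses the `nbytes` function from the
# `cachey` package for this purpose. Assuming that import, a minimal end-to-end profiling
# run with all three diagnostics saved to an HTML report looks like the sketch below
# (the array and report path are illustrative, not taken from the original scripts).
import dask.array as da
from cachey import nbytes
from dask.diagnostics import CacheProfiler, Profiler, ResourceProfiler, visualize


def profile_small_computation(report_path="outputs/sketch_profile.html"):
    arr = da.random.random((1000, 1000), chunks=(250, 250))

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler(metric=nbytes) as cprof:
        arr.mean().compute()

    # write the combined bokeh report to disk instead of opening a browser tab
    visualize([prof, rprof, cprof], file_path=report_path, show=False)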