def test_cache_profiler_plot(): with CacheProfiler(metric_name="non-standard") as cprof: get(dsk, "e") p = cprof.visualize( width=500, height=300, tools="hover", title="Not the default", show=False, save=False, ) if BOKEH_VERSION().major < 3: assert p.plot_width == 500 assert p.plot_height == 300 else: assert p.width == 500 assert p.height == 300 assert len(p.tools) == 1 assert isinstance(p.tools[0], bokeh.models.HoverTool) assert p.title.text == "Not the default" assert p.axis[1].axis_label == "Cache Size (non-standard)" # Test empty, checking for errors cprof.clear() with warnings.catch_warnings(record=True) as record: cprof.visualize(show=False, save=False) assert not record
def main():
    global sky
    global dirty
    global psf

    list_schedule = []
    list_compute = []
    list_total = []
    list_load = []

    start_time1 = time.time()
    sky_npy, sky = load_data(os.path.split(os.getcwd())[0] + '/sky.npy')
    dirty_npy, dirty = load_data(os.path.split(os.getcwd())[0] + '/dirty.npy')
    psf_npy, psf = load_data(os.path.split(os.getcwd())[0] + '/psf.npy')
    end_time1 = time.time()

    start_time2 = time.time()
    scheduling()
    end_time2 = time.time()

    pbar = ProgressBar()
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
        start_time3 = time.time()
        hub.compute()
        end_time3 = time.time()
        # pbar.register()
        # quad.compute()
        # pbar.unregister()
        with PrintKeys():
            hub.compute()

    print("\n" + "Profiling results:")
    print(prof.results[0])
    print("\n" + "Memory usage is reported in MB and CPU information as % CPU usage")
    print(rprof.results)
    print("\n" + "Cache profiling results:")
    print(cprof.results[0])
    visualize([prof, rprof, cprof])

    list_load.append(end_time1 - start_time1)
    list_schedule.append(end_time2 - start_time2)
    list_compute.append(end_time3 - start_time3)
    list_total.append(end_time3 - start_time1)

    print("\n" + "Timings of the analysed code")
    print('load time: {}'.format(round(sum(list_load) / len(list_load), 4)))
    print('scheduling time: {}'.format(round(sum(list_schedule) / len(list_schedule), 4)))
    print('compute time: {}'.format(round(sum(list_compute) / len(list_compute), 4)))
    print('total time: {}'.format(round(sum(list_total) / len(list_total), 4)))
def test_cache_profiler():
    with CacheProfiler() as cprof:
        out = get(dsk2, 'c')
    results = cprof.results
    assert all(isinstance(i, tuple) and len(i) == 5 for i in results)

    cprof.clear()
    assert cprof.results == []

    tics = [0]

    def nbytes(res):
        tics[0] += 1
        return tics[0]

    with CacheProfiler(nbytes) as cprof:
        out = get(dsk2, 'c')

    results = cprof.results
    assert tics[-1] == len(results)
    assert tics[-1] == results[-1].metric
    assert cprof._metric_name == 'nbytes'

    assert CacheProfiler(metric=nbytes, metric_name='foo')._metric_name == 'foo'
def uncompress_to_hdf5():
    print('Writing to hdf5 file after loading raw data in RAM.')
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to hdf5 without compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_uncompressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None)
        print(
            f'time to save the array to hdf5 without compression: {time.time() - t}'
        )
    visualize([prof, rprof, cprof], out_file_path)

    # write to hdf5 with gzip compression
    out_filepath = 'data/out.hdf5'
    os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_compressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None, compression="gzip")
        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
    visualize([prof, rprof, cprof], out_file_path)
def onthefly_to_nps():
    print('Writing to npy stack file without loading raw data in RAM.')
    out_dir = 'data/out_3_numpy'
    out_file_path = "outputs/write_npy_stack.html"

    # write to numpy stack
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        write_to_npy_stack(out_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - t}')
    visualize([prof, rprof, cprof], out_file_path)
def test_cache_profiler_plot():
    with CacheProfiler(metric_name='non-standard') as cprof:
        get(dsk, 'e')
    p = cprof.visualize(plot_width=500,
                        plot_height=300,
                        tools="hover",
                        title="Not the default",
                        show=False,
                        save=False)
    assert p.plot_width == 500
    assert p.plot_height == 300
    assert len(p.tools) == 1
    assert isinstance(p.tools[0], bokeh.models.HoverTool)
    assert check_title(p, "Not the default")
    assert p.axis[1].axis_label == 'Cache Size (non-standard)'
    # Test empty, checking for errors
    cprof.clear()
    cprof.visualize(show=False, save=False)
def execute(self, wf, client):
    if not wf.processes:
        return {}

    dsk = wf.convertGraph()

    with Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
        result = client.get(dsk[0], dsk[1])

    msg.logMessage('result:', result, level=msg.DEBUG)

    path = user_config_dir('xicam/profile.html')
    visualize([prof, rprof, cprof], show=False, file_path=path)
    msg.logMessage(f'Profile saved: {path}')

    wf.lastresult = result

    return result
def uncompress_to_npy():
    print('Writing to numpy file after loading raw data in RAM.')
    out_filepath = 'data/out_1.npy'
    diagnostics_filepath = "outputs/load_raw_write_npy_file.html"
    raw_arr = uncompress()

    # write to numpy file
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        np.save(out_filepath, raw_arr)
        print(f'time to save the array to numpy file: {time.time() - t}')
    visualize([prof, rprof, cprof], diagnostics_filepath)
def _execute_graph(self, *writes):
    # Set up Profilers and Progress Bars
    with ExitStack() as stack:
        profilers = []

        if can_profile:
            from dask.diagnostics import (Profiler, CacheProfiler,
                                          ResourceProfiler, visualize)
            profilers.append(stack.enter_context(Profiler()))
            profilers.append(stack.enter_context(CacheProfiler()))
            profilers.append(stack.enter_context(ResourceProfiler()))

        if sys.stdout.isatty() and not self.args.boring:
            from dask.diagnostics import ProgressBar
            stack.enter_context(ProgressBar())

        dask.compute(*writes, scheduler='single-threaded')
        logger.info("Averaging Complete")

    if can_profile:
        visualize(profilers)
def uncompress_to_nps():
    print('Writing to numpy stack after loading raw data in RAM.')

    # load data in RAM
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to numpy stack
    out_dir = 'data/out_numpy'
    out_file_path = "outputs/load_raw_write_npy_stack.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        write_to_npy_stack(out_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - t}')
    visualize([prof, rprof, cprof], out_file_path)
def test_cache_profiler_plot(): with CacheProfiler(metric_name="non-standard") as cprof: get(dsk, "e") p = cprof.visualize( plot_width=500, plot_height=300, tools="hover", title="Not the default", show=False, save=False, ) assert p.plot_width == 500 assert p.plot_height == 300 assert len(p.tools) == 1 assert isinstance(p.tools[0], bokeh.models.HoverTool) assert p.title.text == "Not the default" assert p.axis[1].axis_label == "Cache Size (non-standard)" # Test empty, checking for errors cprof.clear() with pytest.warns(None) as record: cprof.visualize(show=False, save=False) assert len(record) == 0
def onthefly_to_hdf5():
    print('Writing to hdf5 file without loading raw data in RAM.')

    # write to hdf5 with gzip compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/write_hdf5.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=(1400, 1400, 350),
                   compression="gzip")
        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
    visualize([prof, rprof, cprof], out_file_path)
dsk = {}
files = sorted(glob.glob("{0}/*.tif".format(data_path)))
final_saves = []
for filename in files:
    filename_cleaned = filename.split("/")[-1].split(".")[0]
    dsk['threshold-{0}'.format(filename_cleaned)] = (threshold, filename)
    dsk['min_size-{0}'.format(filename_cleaned)] = (
        min_size, 'threshold-{0}'.format(filename_cleaned))
    dsk['clean-{0}'.format(filename_cleaned)] = (
        clean, 'min_size-{0}'.format(filename_cleaned))
    dsk['reveal-{0}'.format(filename_cleaned)] = (
        reveal, 'clean-{0}'.format(filename_cleaned))
    dsk['pearlite-{0}'.format(filename_cleaned)] = (
        pearlite, 'reveal-{0}'.format(filename_cleaned))
    dsk['ferrite-{0}'.format(filename_cleaned)] = (
        ferrite, 'pearlite-{0}'.format(filename_cleaned))
    dsk['cemmentite-{0}'.format(filename_cleaned)] = (
        cemmentite, 'ferrite-{0}'.format(filename_cleaned))
    dsk['save-{0}'.format(filename_cleaned)] = (
        save, 'cemmentite-{0}'.format(filename_cleaned))
    final_saves.append('save-{0}'.format(filename_cleaned))

dsk['finalize'] = (finalize, final_saves)

dot_graph(dsk)

with ResourceProfiler(0.25) as rprof, Profiler() as prof, CacheProfiler(
        ) as cprof, ProgressBar():
    dak_get(dsk, 'finalize')

visualize([prof, rprof, cprof])
import sys

import pandas as pd

sys.path.append('/Users/pradap/Documents/Research/Python-Package/scaling/dmagellan')

from dmagellan.feature.extractfeatures import extract_feature_vecs
from dmagellan.feature.autofeaturegen import get_features_for_matching

from dask import multiprocessing, threaded
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
import cloudpickle

filename = './profres_exp_mt_dblp_300k_extractfeatvecs.html'
pbar = ProgressBar()
pbar.register()

# print("Mem. usage before reading:{0}".format(psutil.virtual_memory().used/1e9))
A = pd.read_csv('./datasets/sample_citeseer_300k.csv')
B = pd.read_csv('./datasets/sample_dblp_300k.csv')
# print("Mem. usage after reading:{0}".format(psutil.virtual_memory().used/1e9))
C = pd.read_csv('./datasets/candset.csv')

feature_table = get_features_for_matching(A, B)

feature_vecs = extract_feature_vecs(C, A, B, '_id', 'l_id', 'r_id', 'id', 'id',
                                    feature_table=feature_table,
                                    nchunks=4, compute=False)

with Profiler() as prof, CacheProfiler() as cprof, ResourceProfiler(dt=0.25) as rprof:
    D = feature_vecs.compute(get=threaded.get, num_workers=4)

visualize([prof, cprof, rprof], file_path=filename, show=False)
def main(cfgfile, starttime=None, endtime=None, trajfile="", trajtype='plane',
         flashnr=0, infostr="", MULTIPROCESSING_DSET=False,
         MULTIPROCESSING_PROD=False, PROFILE_MULTIPROCESSING=False):
    """
    Main flow control. Processes radar data off-line over a period of time
    given either by the user, a trajectory file, or determined by the last
    volume processed and the current time. Multiple radars can be processed
    simultaneously.

    Parameters
    ----------
    cfgfile : str
        path of the main config file
    starttime, endtime : datetime object
        start and end time of the data to be processed
    trajfile : str
        path to file describing the trajectory
    trajtype : str
        type of trajectory file. Can be either 'plane' or 'lightning'
    flashnr : int
        If larger than 0 will select a flash in a lightning trajectory file.
        If 0 the data corresponding to the trajectory of all flashes will be
        plotted
    infostr : str
        Information string about the actual data processing
        (e.g. 'RUN57'). This string is added to product files.
    MULTIPROCESSING_DSET : bool
        If True the generation of datasets at the same processing level will
        be parallelized
    MULTIPROCESSING_PROD : bool
        If True the generation of products from each dataset will be
        parallelized
    PROFILE_MULTIPROCESSING : bool
        If True and the code is parallelized, the multiprocessing is profiled

    """
    print("- PYRAD version: %s (compiled %s by %s)" %
          (pyrad_version.version, pyrad_version.compile_date_time,
           pyrad_version.username))
    print("- PYART version: " + pyart_version.version)

    # Define behaviour of warnings
    warnings.simplefilter('always')  # always print matching warnings
    # warnings.simplefilter('error')  # turn matching warnings into exceptions
    warnings.formatwarning = _warning_format  # define format

    if ALLOW_USER_BREAK:
        input_queue = _initialize_listener()

    if not _DASK_AVAILABLE:
        MULTIPROCESSING_DSET = False
        MULTIPROCESSING_PROD = False
        PROFILE_MULTIPROCESSING = False

    # check if multiprocessing profiling is necessary
    if not MULTIPROCESSING_DSET and not MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False
    elif MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False

    if MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        # necessary to launch tasks from tasks
        Client()

    if PROFILE_MULTIPROCESSING:
        prof = Profiler()
        rprof = ResourceProfiler()
        cprof = CacheProfiler()

        prof.register()
        rprof.register()
        cprof.register()

    cfg = _create_cfg_dict(cfgfile)
    datacfg = _create_datacfg_dict(cfg)

    starttime, endtime, traj = _get_times_and_traj(
        trajfile, starttime, endtime, cfg['ScanPeriod'],
        last_state_file=cfg['lastStateFile'], trajtype=trajtype,
        flashnr=flashnr)

    if infostr:
        print('- Info string : ' + infostr)

    # get data types and levels
    datatypesdescr_list = list()
    for i in range(1, cfg['NumRadars'] + 1):
        datatypesdescr_list.append(
            _get_datatype_list(cfg, radarnr='RADAR' + '{:03d}'.format(i)))

    dataset_levels = _get_datasets_list(cfg)

    masterfilelist, masterdatatypedescr, masterscan = _get_masterfile_list(
        datatypesdescr_list[0], starttime, endtime, datacfg,
        scan_list=datacfg['ScanList'])

    nvolumes = len(masterfilelist)
    if nvolumes == 0:
        raise ValueError(
            "ERROR: Could not find any valid volumes between " +
            starttime.strftime('%Y-%m-%d %H:%M:%S') + " and " +
            endtime.strftime('%Y-%m-%d %H:%M:%S') + " for " +
            "master scan '" + str(masterscan) +
            "' and master data type '" + masterdatatypedescr + "'")
    print('- Number of volumes to process: ' + str(nvolumes))
    print('- Start time: ' + starttime.strftime("%Y-%m-%d %H:%M:%S"))
    print('- end time: ' + endtime.strftime("%Y-%m-%d %H:%M:%S"))

    # initial processing of the datasets
    print('\n\n- Initializing datasets:')
    dscfg, traj = _initialize_datasets(
        dataset_levels, cfg, traj=traj, infostr=infostr)

    # process all data files in file list or until user interrupts processing
    for masterfile in masterfilelist:
        if ALLOW_USER_BREAK:
            # check if user has requested exit
            try:
                input_queue.get_nowait()
                warn('Program terminated by user')
                break
            except queue.Empty:
                pass

        print('\n- master file: ' + os.path.basename(masterfile))
        master_voltime = get_datetime(masterfile, masterdatatypedescr)

        radar_list = _get_radars_data(
            master_voltime, datatypesdescr_list, datacfg,
            num_radars=datacfg['NumRadars'])

        # process all data sets
        dscfg, traj = _process_datasets(
            dataset_levels, cfg, dscfg, radar_list, master_voltime, traj=traj,
            infostr=infostr, MULTIPROCESSING_DSET=MULTIPROCESSING_DSET,
            MULTIPROCESSING_PROD=MULTIPROCESSING_PROD)

        # delete variables
        del radar_list
        gc.collect()

    # post-processing of the datasets
    print('\n\n- Post-processing datasets:')
    dscfg, traj = _postprocess_datasets(
        dataset_levels, cfg, dscfg, traj=traj, infostr=infostr)

    if PROFILE_MULTIPROCESSING:
        prof.unregister()
        rprof.unregister()
        cprof.unregister()

        bokeh_plot = visualize([prof, rprof, cprof], show=False, save=False)

        profile_path = os.path.expanduser('~') + '/profiling/'
        if not os.path.isdir(profile_path):
            os.makedirs(profile_path)

        export_png(bokeh_plot, filename=(
            profile_path + datetime.utcnow().strftime('%Y%m%d%H%M%S') +
            '_profile.png'))

    print('- This is the end my friend! See you soon!')
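# A hedged usage sketch (config path and dates are placeholders, not from the source):
# per the flag handling above, the multiprocessing profile is only produced when exactly
# one of the two MULTIPROCESSING_* flags is set and dask is available.
if __name__ == '__main__':
    from datetime import datetime
    main('config/main_config.txt',
         starttime=datetime(2020, 6, 1, 0, 0),
         endtime=datetime(2020, 6, 1, 12, 0),
         MULTIPROCESSING_DSET=True,
         MULTIPROCESSING_PROD=False,
         PROFILE_MULTIPROCESSING=True)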
def compute(self, **kwargs):
    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
        self._computed_result = dask.compute(self._result, **kwargs)[0]
    self._prof = prof
    self._rprof = rprof
    self._cprof = cprof
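# A minimal companion sketch (hypothetical, not part of the original class): since
# compute() above stores the three profilers on the instance, a helper such as
# visualize_profile() could render them later. The method name and file path are
# assumptions; dask.diagnostics.visualize accepts the output path as its second argument.
def visualize_profile(self, file_path="profile.html"):
    # Render the profilers captured during compute() into a single Bokeh document.
    from dask.diagnostics import visualize
    return visualize([self._prof, self._rprof, self._cprof], file_path, show=False)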
def _main(args):
    tic = time.time()

    log.info(banner())

    if args.disable_post_mortem:
        log.warn("Disabling crash debugging with the "
                 "Interactive Python Debugger, as per user request")
        post_mortem_handler.disable_pdb_on_error()

    log.info("Flagging on the {0:s} column".format(args.data_column))
    data_column = args.data_column
    masked_channels = [
        load_mask(fn, dilate=args.dilate_masks) for fn in collect_masks()
    ]
    GD = args.config

    log_configuration(args)

    # Group datasets by these columns
    group_cols = ["FIELD_ID", "DATA_DESC_ID", "SCAN_NUMBER"]
    # Index datasets by these columns
    index_cols = ['TIME']

    # Reopen the datasets using the aggregated row ordering
    columns = [data_column, "FLAG", "TIME", "ANTENNA1", "ANTENNA2"]
    if args.subtract_model_column is not None:
        columns.append(args.subtract_model_column)

    xds = list(
        xds_from_ms(args.ms,
                    columns=tuple(columns),
                    group_cols=group_cols,
                    index_cols=index_cols,
                    chunks={"row": args.row_chunks}))

    # Get support tables
    st = support_tables(args.ms)
    ddid_ds = st["DATA_DESCRIPTION"]
    field_ds = st["FIELD"]
    pol_ds = st["POLARIZATION"]
    spw_ds = st["SPECTRAL_WINDOW"]
    ant_ds = st["ANTENNA"]

    assert len(ant_ds) == 1
    assert len(ddid_ds) == 1

    antspos = ant_ds[0].POSITION.data
    antsnames = ant_ds[0].NAME.data
    fieldnames = [fds.NAME.data[0] for fds in field_ds]

    avail_scans = [ds.SCAN_NUMBER for ds in xds]
    args.scan_numbers = list(
        set(avail_scans).intersection(
            args.scan_numbers if args.scan_numbers is not None else avail_scans))

    if args.scan_numbers != []:
        log.info("Only considering scans '{0:s}' as "
                 "per user selection criterion".format(
                     ", ".join(map(str, map(int, args.scan_numbers)))))

    if args.field_names != []:
        flatten_field_names = []
        for f in args.field_names:
            # accept comma lists per specification
            flatten_field_names += [x.strip() for x in f.split(",")]
        for f in flatten_field_names:
            if re.match(r"^\d+$", f) and int(f) < len(fieldnames):
                flatten_field_names.append(fieldnames[int(f)])
        flatten_field_names = list(
            set(filter(lambda x: not re.match(r"^\d+$", x),
                       flatten_field_names)))
        log.info("Only considering fields '{0:s}' for flagging per "
                 "user selection criterion.".format(
                     ", ".join(flatten_field_names)))
        if not set(flatten_field_names) <= set(fieldnames):
            raise ValueError("One or more fields cannot be "
                             "found in dataset '{0:s}' "
                             "You specified {1:s}, but "
                             "only {2:s} are available".format(
                                 args.ms,
                                 ",".join(flatten_field_names),
                                 ",".join(fieldnames)))
        field_dict = {fieldnames.index(fn): fn for fn in flatten_field_names}
    else:
        field_dict = {i: fn for i, fn in enumerate(fieldnames)}

    # List which hold our dask compute graphs for each dataset
    write_computes = []
    original_stats = []
    final_stats = []

    # Iterate through each dataset
    for ds in xds:
        if ds.FIELD_ID not in field_dict:
            continue

        if (args.scan_numbers is not None
                and ds.SCAN_NUMBER not in args.scan_numbers):
            continue

        log.info("Adding field '{0:s}' scan {1:d} to "
                 "compute graph for processing".format(
                     field_dict[ds.FIELD_ID], ds.SCAN_NUMBER))

        ddid = ddid_ds[ds.attrs['DATA_DESC_ID']]
        spw_info = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]]
        pol_info = pol_ds[ddid.POLARIZATION_ID.data[0]]

        nrow, nchan, ncorr = getattr(ds, data_column).data.shape

        # Visibilities from the dataset
        vis = getattr(ds, data_column).data
        if args.subtract_model_column is not None:
            log.info("Forming residual data between '{0:s}' and "
                     "'{1:s}' for flagging.".format(
                         data_column, args.subtract_model_column))
            vismod = getattr(ds, args.subtract_model_column).data
            vis = vis - vismod

        antenna1 = ds.ANTENNA1.data
        antenna2 = ds.ANTENNA2.data
        chan_freq = spw_info.CHAN_FREQ.data[0]
        chan_width = spw_info.CHAN_WIDTH.data[0]

        # Generate unflagged defaults if we should ignore existing flags
        # otherwise take flags from the dataset
        if args.ignore_flags is True:
            flags = da.full_like(vis, False, dtype=np.bool)
            log.critical("Completely ignoring measurement set "
                         "flags as per '-if' request. "
                         "Strategy WILL NOT or with original flags, even if "
                         "specified!")
        else:
            flags = ds.FLAG.data

        # If we're flagging on polarised intensity,
        # we convert visibilities to polarised intensity
        # and any flagged correlation will flag the entire visibility
        if args.flagging_strategy == "polarisation":
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items() if k != "I")
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "total_power":
            if args.subtract_model_column is None:
                log.critical("You requested to flag total quadrature "
                             "power, but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of "
                             "off-axis sources for broadband RFI.")
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items())
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "standard":
            if args.subtract_model_column is None:
                log.critical("You requested to flag per correlation, "
                             "but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of off-axis sources "
                             "for broadband RFI.")
        else:
            raise ValueError("Invalid flagging strategy '%s'" %
                             args.flagging_strategy)

        ubl = unique_baselines(antenna1, antenna2)
        utime, time_inv = da.unique(ds.TIME.data, return_inverse=True)
        utime, ubl = dask.compute(utime, ubl)
        ubl = ubl.view(np.int32).reshape(-1, 2)
        # Stack the baseline index with the unique baselines
        bl_range = np.arange(ubl.shape[0], dtype=ubl.dtype)[:, None]
        ubl = np.concatenate([bl_range, ubl], axis=1)
        ubl = da.from_array(ubl, chunks=(args.baseline_chunks, 3))

        vis_windows, flag_windows = pack_data(time_inv, ubl,
                                              antenna1, antenna2,
                                              vis, flags, utime.shape[0],
                                              backend=args.window_backend,
                                              path=args.temporary_directory)

        original_stats.append(
            window_stats(flag_windows, ubl, chan_freq,
                         antsnames, ds.SCAN_NUMBER,
                         field_dict[ds.FIELD_ID],
                         ds.attrs['DATA_DESC_ID']))

        with StrategyExecutor(antspos, ubl, chan_freq, chan_width,
                              masked_channels, GD['strategies']) as se:
            flag_windows = se.apply_strategies(flag_windows, vis_windows)

        final_stats.append(
            window_stats(flag_windows, ubl, chan_freq,
                         antsnames, ds.SCAN_NUMBER,
                         field_dict[ds.FIELD_ID],
                         ds.attrs['DATA_DESC_ID']))

        # Unpack window data for writing back to the MS
        unpacked_flags = unpack_data(antenna1, antenna2, time_inv,
                                     ubl, flag_windows)

        # Flag entire visibility if any correlations are flagged
        equalized_flags = da.sum(unpacked_flags, axis=2, keepdims=True) > 0
        corr_flags = da.broadcast_to(equalized_flags, (nrow, nchan, ncorr))

        if corr_flags.chunks != ds.FLAG.data.chunks:
            raise ValueError("Output flag chunking does not "
                             "match input flag chunking")

        # Create new dataset containing new flags
        new_ds = ds.assign(FLAG=(("row", "chan", "corr"), corr_flags))

        # Write back to original dataset
        writes = xds_to_table(new_ds, args.ms, "FLAG")
        # original should also have .compute called because we need stats
        write_computes.append(writes)

    if len(write_computes) > 0:
        # Combine stats from all datasets
        original_stats = combine_window_stats(original_stats)
        final_stats = combine_window_stats(final_stats)

        with contextlib.ExitStack() as stack:
            # Create dask profiling contexts
            profilers = []

            if can_profile:
                profilers.append(stack.enter_context(Profiler()))
                profilers.append(stack.enter_context(CacheProfiler()))
                profilers.append(stack.enter_context(ResourceProfiler()))

            if sys.stdout.isatty():
                # Interactive terminal, default ProgressBar
                stack.enter_context(ProgressBar())
            else:
                # Non-interactive, emit a bar every 5 minutes so
                # as not to spam the log
                stack.enter_context(ProgressBar(minimum=1, dt=5 * 60))

            _, original_stats, final_stats = dask.compute(
                write_computes, original_stats, final_stats)

        if can_profile:
            visualize(profilers)

        toc = time.time()

        # Log each summary line
        for line in summarise_stats(final_stats, original_stats):
            log.info(line)

        elapsed = toc - tic
        log.info("Data flagged successfully in "
                 "{0:02.0f}h{1:02.0f}m{2:02.0f}s".format(
                     (elapsed // 60) // 60,
                     (elapsed // 60) % 60,
                     elapsed % 60))
    else:
        log.info("User data selection criteria resulted in empty dataset. "
                 "Nothing to be done. Bye!")
result = (da_input**2. + da_input**3.).mean(axis=0)
result

# %% [markdown]
# ### Note that result hasn't been computed yet
#
# Here is a graph of how the calculation will be split among 4 threads

# %%
from dask.dot import dot_graph
dot_graph(result.dask)

# %% [markdown]
# ### Now do the calculation

# %%
with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof,\
        CacheProfiler() as cprof:
    answer = result.compute()

# %% [markdown]
# Visualize the cpu, memory and cache for the 4 threads

# %%
visualize([prof, rprof, cprof], min_border_top=15, min_border_bottom=15)

# %% [markdown]
# ### You can evaluate your own functions on dask arrays
#
# If your functions release the GIL, you can get multithreaded computation using [dask.delayed](http://dask.pydata.org/en/latest/delayed.html), as sketched in the cell below.
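# %% [markdown]
# A minimal sketch (added as an illustration, not from the original notebook): wrapping an
# ordinary function with `dask.delayed` so that several calls run under the threaded
# scheduler and can be profiled with the same diagnostics as above. The function name
# `my_func` and the array sizes are arbitrary choices.

# %%
import dask
import numpy as np
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize

@dask.delayed
def my_func(x):
    # Pure-numpy work largely releases the GIL, so these calls can overlap across threads
    return (x**2. + x**3.).mean()

delayed_results = [my_func(np.random.random(1_000_000)) for _ in range(4)]

with Profiler() as prof2, ResourceProfiler(dt=0.1) as rprof2, CacheProfiler() as cprof2:
    answers = dask.compute(*delayed_results)

visualize([prof2, rprof2, cprof2], min_border_top=15, min_border_bottom=15)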
def rechunk_vanilla_dask(indir_path, outdir_path, nthreads, R, O, model):
    """ Rechunk using vanilla dask
    """
    in_arrays = load_input_files(indir_path)

    case = Merge('samplename')
    case.merge_hdf5_multiple(indir_path, store=False)
    reconstructed_array = case.get()

    out_files = list()  # to keep outfiles open during processing
    sources = list()
    targets = list()
    outfiles_partition = get_blocks_shape(R, O)
    for i in range(outfiles_partition[0]):
        for j in range(outfiles_partition[1]):
            for k in range(outfiles_partition[2]):
                out_filename = f'{i}_{j}_{k}.hdf5'
                out_file = h5py.File(os.path.join(outdir_path, out_filename), 'w')
                dset = out_file.create_dataset('/data', shape=O, dtype=np.float16)

                tmp_array = reconstructed_array[i * O[0]:(i + 1) * O[0],
                                                j * O[1]:(j + 1) * O[1],
                                                k * O[2]:(k + 1) * O[2]]
                print(
                    f'{i*O[0]}: {(i+1)*O[0]}, {j*O[1]}: {(j+1)*O[1]}, {k*O[2]}: {(k+1)*O[2]}'
                )

                out_files.append(out_file)
                sources.append(tmp_array)
                targets.append(dset)

    rechunk_task = da.store(sources, targets, compute=False)

    # rechunk_task.visualize(filename="tmp_dir/test_graph_vanilla.png")
    # sys.exit()

    with Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
        scheduler = 'single-threaded' if nthreads == 1 else 'threads'
        with dask.config.set(scheduler=scheduler):
            try:
                t = time.time()
                rechunk_task.compute()
                t = time.time() - t
                # visualize([prof, rprof, cprof])
            except Exception as e:
                print(e, "\nSomething went wrong during graph execution.")
                t = None

    diagnostics = os.path.join(outdir_path, 'exp5_' + str(model) + '.html')
    visualize([prof, rprof, cprof], diagnostics, show=False)
    clean_files()

    for f in out_files:
        f.close()

    return t
import dask.array as da
from dask.diagnostics import CacheProfiler
from cachey import nbytes

if __name__ == '__main__':
    a = da.random.normal(size=(1000, 10000), chunks=(1000, 1000))
    res = a.dot(a.T).mean(axis=0)

    with CacheProfiler(metric=nbytes) as rprof:
        out = res.compute()

    rprof.visualize()

    # for res in rprof.results:
    #     print(res)
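    # A hedged follow-up to the commented-out loop above (not part of the original
    # script): each CacheProfiler result is a namedtuple with fields
    # (key, task, metric, cache_time, free_time), so the per-key byte counts collected
    # by the cachey nbytes metric can also be printed without the Bokeh plot.
    for key, _task, metric, cache_time, free_time in rprof.results:
        print(f"{key}: {metric} bytes, held in cache for {free_time - cache_time:.4f}s")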
import sys
import time

import dask
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from multiprocessing.pool import ThreadPool

import hyperspy.api as hs

emd_filename_list = sys.argv[1:]
emd_filename_list.sort()

with dask.set_options(pool=ThreadPool(8)), Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
    for emd_filename in emd_filename_list:
        s = hs.load(emd_filename, lazy=True).transpose(signal_axes=(2, 3))
        t0 = time.time()
        result = s.sum()
        print(emd_filename)
        delta = time.time() - t0
        print(delta)
        print("{} MB/s".format(s.data.nbytes / delta / 1024 / 1024))

visualize([prof, rprof, cprof])
def test_cache_profiler_plot_with_invalid_bokeh_kwarg_raises_error():
    with CacheProfiler(metric_name="non-standard") as cprof:
        get(dsk, "e")
    with pytest.raises(AttributeError, match="foo_bar"):
        cprof.visualize(foo_bar="fake")
import sys
import time

import dask
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from multiprocessing.pool import ThreadPool

import hyperspy.api as hs

emd_filename_list = sys.argv[1:]
emd_filename_list.sort()

with dask.set_options(
        pool=ThreadPool(8)), Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
    for emd_filename in emd_filename_list:
        s = hs.load(emd_filename, lazy=True).transpose(signal_axes=(2, 3))
        t0 = time.time()
        result = s.sum()
        print(emd_filename)
        delta = time.time() - t0
        print(delta)
        print(f"{s.data.nbytes / delta / 1024 / 1024} MB/s")

visualize([prof, rprof, cprof])
times = list()
for buffer in buffers_to_test:
    print("RUNNING BUFFER ", buffer)

    with h5py.File(input_filepath, 'r') as f_in:  # open original array
        dset = f_in['/data']
        in_arr = da.from_array(dset, chunks=split_cs)

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
            # run optimized
            split_arr = split_to_hdf5(in_arr, f_out, nb_blocks=None)
            print("RUNNING OPTIMIZED")
            enable_clustering(buffer)
            flush_cache()
            with Profiler() as prof, ResourceProfiler(
                    ) as rprof, CacheProfiler(metric=nbytes) as cprof:
                with dask.config.set(scheduler='single-threaded'):
                    t = time.time()
                    _ = split_arr.compute()
                    t = time.time() - t
                    times.append([buffer, t, "optimized"])
            visualize([prof, rprof, cprof],
                      os.path.join(output_directory, str(buffer) + "opti" + ".html"),
                      show=False)

        os.remove(output_filepath)  # remove output file for next run

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
            # run non optimized
            split_arr = split_to_hdf5(in_arr, f_out, nb_blocks=None)
            print("RUNNING NON OPTIMIZED")