def test_cache_profiler_plot(): with CacheProfiler(metric_name="non-standard") as cprof: get(dsk, "e") p = cprof.visualize( width=500, height=300, tools="hover", title="Not the default", show=False, save=False, ) if BOKEH_VERSION().major < 3: assert p.plot_width == 500 assert p.plot_height == 300 else: assert p.width == 500 assert p.height == 300 assert len(p.tools) == 1 assert isinstance(p.tools[0], bokeh.models.HoverTool) assert p.title.text == "Not the default" assert p.axis[1].axis_label == "Cache Size (non-standard)" # Test empty, checking for errors cprof.clear() with warnings.catch_warnings(record=True) as record: cprof.visualize(show=False, save=False) assert not record
def main():
    global sky
    global dirty
    global psf

    list_schedule = []
    list_compute = []
    list_total = []
    list_load = []

    start_time1 = time.time()
    sky_npy, sky = load_data(os.path.split(os.getcwd())[0] + '/sky.npy')
    dirty_npy, dirty = load_data(os.path.split(os.getcwd())[0] + '/dirty.npy')
    psf_npy, psf = load_data(os.path.split(os.getcwd())[0] + '/psf.npy')
    end_time1 = time.time()

    start_time2 = time.time()
    scheduling()
    end_time2 = time.time()

    pbar = ProgressBar()
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
        start_time3 = time.time()
        hub.compute()
        end_time3 = time.time()
        # pbar.register()
        # quad.compute()
        # pbar.unregister()
        with PrintKeys():
            hub.compute()

    print("\n" + "Profiling results:")
    print(prof.results[0])
    print("\n" + "Memory usage is reported in MB and CPU information as % CPU usage")
    print(rprof.results)
    print("\n" + "Cache profiling results:")
    print(cprof.results[0])
    visualize([prof, rprof, cprof])

    list_load.append(end_time1 - start_time1)
    list_schedule.append(end_time2 - start_time2)
    list_compute.append(end_time3 - start_time3)
    list_total.append(end_time3 - start_time1)

    print("\n" + "Timings of the analysed code")
    print('load time: {}'.format(round(sum(list_load) / len(list_load), 4)))
    print('scheduling time: {}'.format(round(sum(list_schedule) / len(list_schedule), 4)))
    print('compute time: {}'.format(round(sum(list_compute) / len(list_compute), 4)))
    print('total time: {}'.format(round(sum(list_total) / len(list_total), 4)))
def test_cache_profiler():
    with CacheProfiler() as cprof:
        out = get(dsk2, 'c')
    results = cprof.results
    assert all(isinstance(i, tuple) and len(i) == 5 for i in results)

    cprof.clear()
    assert cprof.results == []

    tics = [0]

    def nbytes(res):
        tics[0] += 1
        return tics[0]

    with CacheProfiler(nbytes) as cprof:
        out = get(dsk2, 'c')

    results = cprof.results
    assert tics[-1] == len(results)
    assert tics[-1] == results[-1].metric
    assert cprof._metric_name == 'nbytes'

    assert CacheProfiler(metric=nbytes, metric_name='foo')._metric_name == 'foo'
def uncompress_to_hdf5():
    print('Writing to hdf5 file after loading raw data in RAM.')
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to hdf5 without compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_uncompressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None)
        print(
            f'time to save the array to hdf5 without compression: {time.time() - t}'
        )
    visualize([prof, rprof, cprof], out_file_path)

    # write to hdf5 with gzip compression
    out_filepath = 'data/out.hdf5'
    os.remove(out_filepath)
    out_file_path = "outputs/load_raw_write_hdf5_compressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=None, compression="gzip")
        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
    visualize([prof, rprof, cprof], out_file_path)
def onthefly_to_nps():
    print('Writing to npy stack file without loading raw data in RAM.')
    out_dir = 'data/out_3_numpy'
    out_file_path = "outputs/write_npy_stack.html"

    # write to numpy stack
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        write_to_npy_stack(out_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - t}')
    visualize([prof, rprof, cprof], out_file_path)
def test_cache_profiler_plot():
    with CacheProfiler(metric_name='non-standard') as cprof:
        get(dsk, 'e')
    p = cprof.visualize(plot_width=500,
                        plot_height=300,
                        tools="hover",
                        title="Not the default",
                        show=False,
                        save=False)
    assert p.plot_width == 500
    assert p.plot_height == 300
    assert len(p.tools) == 1
    assert isinstance(p.tools[0], bokeh.models.HoverTool)
    assert check_title(p, "Not the default")
    assert p.axis[1].axis_label == 'Cache Size (non-standard)'
    # Test empty, checking for errors
    cprof.clear()
    cprof.visualize(show=False, save=False)
def execute(self, wf, client):
    if not wf.processes:
        return {}

    dsk = wf.convertGraph()

    with Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
        result = client.get(dsk[0], dsk[1])

    msg.logMessage('result:', result, level=msg.DEBUG)

    path = user_config_dir('xicam/profile.html')
    visualize([prof, rprof, cprof], show=False, file_path=path)
    msg.logMessage(f'Profile saved: {path}')

    wf.lastresult = result

    return result
def uncompress_to_npy():
    print('Writing to numpy file after loading raw data in RAM.')
    out_filepath = 'data/out_1.npy'
    diagnostics_filepath = "outputs/load_raw_write_npy_file.html"
    raw_arr = uncompress()

    # write to numpy file
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        np.save(out_filepath, raw_arr)
        print(f'time to save the array to numpy file: {time.time() - t}')
    visualize([prof, rprof, cprof], diagnostics_filepath)
def _execute_graph(self, *writes):
    # Set up Profilers and Progress Bars
    with ExitStack() as stack:
        profilers = []

        if can_profile:
            from dask.diagnostics import (Profiler, CacheProfiler,
                                          ResourceProfiler, visualize)
            profilers.append(stack.enter_context(Profiler()))
            profilers.append(stack.enter_context(CacheProfiler()))
            profilers.append(stack.enter_context(ResourceProfiler()))

        if sys.stdout.isatty() and not self.args.boring:
            from dask.diagnostics import ProgressBar
            stack.enter_context(ProgressBar())

        dask.compute(*writes, scheduler='single-threaded')
        logger.info("Averaging Complete")

    if can_profile:
        visualize(profilers)
def uncompress_to_nps():
    print('Writing to numpy stack after loading raw data in RAM.')

    # load data in RAM
    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to numpy stack
    out_dir = 'data/out_numpy'
    out_file_path = "outputs/load_raw_write_npy_stack.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        write_to_npy_stack(out_dir, arr)
        print(f'time to save the array to numpy stack: {time.time() - t}')
    visualize([prof, rprof, cprof], out_file_path)
def test_cache_profiler_plot(): with CacheProfiler(metric_name="non-standard") as cprof: get(dsk, "e") p = cprof.visualize( plot_width=500, plot_height=300, tools="hover", title="Not the default", show=False, save=False, ) assert p.plot_width == 500 assert p.plot_height == 300 assert len(p.tools) == 1 assert isinstance(p.tools[0], bokeh.models.HoverTool) assert p.title.text == "Not the default" assert p.axis[1].axis_label == "Cache Size (non-standard)" # Test empty, checking for errors cprof.clear() with pytest.warns(None) as record: cprof.visualize(show=False, save=False) assert len(record) == 0
def onthefly_to_hdf5():
    print('Writing to hdf5 file without loading raw data in RAM.')

    # write to hdf5 with gzip compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)
    out_file_path = "outputs/write_hdf5.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()
        da.to_hdf5(out_filepath, 'data', arr, chunks=(1400, 1400, 350),
                   compression="gzip")
        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
    visualize([prof, rprof, cprof], out_file_path)
dsk = {}
files = sorted(glob.glob("{0}/*.tif".format(data_path)))
final_saves = []
for filename in files:
    filename_cleaned = filename.split("/")[-1].split(".")[0]
    dsk['threshold-{0}'.format(filename_cleaned)] = (threshold, filename)
    dsk['min_size-{0}'.format(filename_cleaned)] = (
        min_size, 'threshold-{0}'.format(filename_cleaned))
    dsk['clean-{0}'.format(filename_cleaned)] = (
        clean, 'min_size-{0}'.format(filename_cleaned))
    dsk['reveal-{0}'.format(filename_cleaned)] = (
        reveal, 'clean-{0}'.format(filename_cleaned))
    dsk['pearlite-{0}'.format(filename_cleaned)] = (
        pearlite, 'reveal-{0}'.format(filename_cleaned))
    dsk['ferrite-{0}'.format(filename_cleaned)] = (
        ferrite, 'pearlite-{0}'.format(filename_cleaned))
    dsk['cemmentite-{0}'.format(filename_cleaned)] = (
        cemmentite, 'ferrite-{0}'.format(filename_cleaned))
    dsk['save-{0}'.format(filename_cleaned)] = (
        save, 'cemmentite-{0}'.format(filename_cleaned))
    final_saves.append('save-{0}'.format(filename_cleaned))

dsk['finalize'] = (finalize, final_saves)

dot_graph(dsk)

with ResourceProfiler(0.25) as rprof, Profiler() as prof, CacheProfiler(
        ) as cprof, ProgressBar():
    dak_get(dsk, 'finalize')

visualize([prof, rprof, cprof])
import sys

import pandas as pd

sys.path.append('/Users/pradap/Documents/Research/Python-Package/scaling/dmagellan')

from dmagellan.feature.extractfeatures import extract_feature_vecs
from dmagellan.feature.autofeaturegen import get_features_for_matching

from dask import multiprocessing, threaded
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
import cloudpickle

filename = './profres_exp_mt_dblp_300k_extractfeatvecs.html'
pbar = ProgressBar()
pbar.register()

# print("Mem. usage before reading:{0}".format(psutil.virtual_memory().used/1e9))
A = pd.read_csv('./datasets/sample_citeseer_300k.csv')
B = pd.read_csv('./datasets/sample_dblp_300k.csv')
# print("Mem. usage after reading:{0}".format(psutil.virtual_memory().used/1e9))
C = pd.read_csv('./datasets/candset.csv')

feature_table = get_features_for_matching(A, B)

feature_vecs = extract_feature_vecs(C, A, B, '_id', 'l_id', 'r_id', 'id', 'id',
                                    feature_table=feature_table,
                                    nchunks=4, compute=False)

with Profiler() as prof, CacheProfiler() as cprof, ResourceProfiler(dt=0.25) as rprof:
    D = feature_vecs.compute(get=threaded.get, num_workers=4)

visualize([prof, cprof, rprof], file_path=filename, show=False)
def main(cfgfile, starttime=None, endtime=None, trajfile="", trajtype='plane',
         flashnr=0, infostr="", MULTIPROCESSING_DSET=False,
         MULTIPROCESSING_PROD=False, PROFILE_MULTIPROCESSING=False):
    """
    Main flow control. Processes radar data off-line over a period of time
    given either by the user, a trajectory file, or determined by the last
    volume processed and the current time. Multiple radars can be processed
    simultaneously.

    Parameters
    ----------
    cfgfile : str
        path of the main config file
    starttime, endtime : datetime object
        start and end time of the data to be processed
    trajfile : str
        path to file describing the trajectory
    trajtype : str
        type of trajectory file. Can be either 'plane' or 'lightning'
    flashnr : int
        If larger than 0 will select a flash in a lightning trajectory file.
        If 0 the data corresponding to the trajectory of all flashes will be
        plotted
    infostr : str
        Information string about the actual data processing
        (e.g. 'RUN57'). This string is added to product files.
    MULTIPROCESSING_DSET : bool
        If True the generation of datasets at the same processing level will
        be parallelized
    MULTIPROCESSING_PROD : bool
        If True the generation of products from each dataset will be
        parallelized
    PROFILE_MULTIPROCESSING : bool
        If True and the code is parallelized, the multiprocessing is profiled

    """
    print("- PYRAD version: %s (compiled %s by %s)" %
          (pyrad_version.version, pyrad_version.compile_date_time,
           pyrad_version.username))
    print("- PYART version: " + pyart_version.version)

    # Define behaviour of warnings
    warnings.simplefilter('always')  # always print matching warnings
    # warnings.simplefilter('error')  # turn matching warnings into exceptions
    warnings.formatwarning = _warning_format  # define format

    if ALLOW_USER_BREAK:
        input_queue = _initialize_listener()

    if not _DASK_AVAILABLE:
        MULTIPROCESSING_DSET = False
        MULTIPROCESSING_PROD = False
        PROFILE_MULTIPROCESSING = False

    # check if multiprocessing profiling is necessary
    if not MULTIPROCESSING_DSET and not MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False
    elif MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        PROFILE_MULTIPROCESSING = False

    if MULTIPROCESSING_DSET and MULTIPROCESSING_PROD:
        # necessary to launch tasks from tasks
        Client()

    if PROFILE_MULTIPROCESSING:
        prof = Profiler()
        rprof = ResourceProfiler()
        cprof = CacheProfiler()

        prof.register()
        rprof.register()
        cprof.register()

    cfg = _create_cfg_dict(cfgfile)
    datacfg = _create_datacfg_dict(cfg)

    starttime, endtime, traj = _get_times_and_traj(
        trajfile, starttime, endtime, cfg['ScanPeriod'],
        last_state_file=cfg['lastStateFile'], trajtype=trajtype,
        flashnr=flashnr)

    if infostr:
        print('- Info string : ' + infostr)

    # get data types and levels
    datatypesdescr_list = list()
    for i in range(1, cfg['NumRadars'] + 1):
        datatypesdescr_list.append(
            _get_datatype_list(cfg, radarnr='RADAR' + '{:03d}'.format(i)))

    dataset_levels = _get_datasets_list(cfg)

    masterfilelist, masterdatatypedescr, masterscan = _get_masterfile_list(
        datatypesdescr_list[0], starttime, endtime, datacfg,
        scan_list=datacfg['ScanList'])

    nvolumes = len(masterfilelist)
    if nvolumes == 0:
        raise ValueError(
            "ERROR: Could not find any valid volumes between " +
            starttime.strftime('%Y-%m-%d %H:%M:%S') + " and " +
            endtime.strftime('%Y-%m-%d %H:%M:%S') + " for " +
            "master scan '" + str(masterscan) +
            "' and master data type '" + masterdatatypedescr + "'")
    print('- Number of volumes to process: ' + str(nvolumes))
    print('- Start time: ' + starttime.strftime("%Y-%m-%d %H:%M:%S"))
    print('- end time: ' + endtime.strftime("%Y-%m-%d %H:%M:%S"))

    # initial processing of the datasets
    print('\n\n- Initializing datasets:')
    dscfg, traj = _initialize_datasets(
        dataset_levels, cfg, traj=traj, infostr=infostr)

    # process all data files in file list or until user interrupts processing
    for masterfile in masterfilelist:
        if ALLOW_USER_BREAK:
            # check if user has requested exit
            try:
                input_queue.get_nowait()
                warn('Program terminated by user')
                break
            except queue.Empty:
                pass

        print('\n- master file: ' + os.path.basename(masterfile))
        master_voltime = get_datetime(masterfile, masterdatatypedescr)

        radar_list = _get_radars_data(
            master_voltime, datatypesdescr_list, datacfg,
            num_radars=datacfg['NumRadars'])

        # process all data sets
        dscfg, traj = _process_datasets(
            dataset_levels, cfg, dscfg, radar_list, master_voltime, traj=traj,
            infostr=infostr, MULTIPROCESSING_DSET=MULTIPROCESSING_DSET,
            MULTIPROCESSING_PROD=MULTIPROCESSING_PROD)

        # delete variables
        del radar_list
        gc.collect()

    # post-processing of the datasets
    print('\n\n- Post-processing datasets:')
    dscfg, traj = _postprocess_datasets(
        dataset_levels, cfg, dscfg, traj=traj, infostr=infostr)

    if PROFILE_MULTIPROCESSING:
        prof.unregister()
        rprof.unregister()
        cprof.unregister()

        bokeh_plot = visualize([prof, rprof, cprof], show=False, save=False)

        profile_path = os.path.expanduser('~') + '/profiling/'
        if not os.path.isdir(profile_path):
            os.makedirs(profile_path)

        export_png(bokeh_plot, filename=(
            profile_path + datetime.utcnow().strftime('%Y%m%d%H%M%S') +
            '_profile.png'))

    print('- This is the end my friend! See you soon!')
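# A hedged usage sketch (config path and dates are placeholders, not from the source):
# per the flag handling above, the multiprocessing profile is only produced when exactly
# one of the two MULTIPROCESSING_* flags is set and dask is available.
if __name__ == '__main__':
    from datetime import datetime
    main('config/main_config.txt',
         starttime=datetime(2020, 6, 1, 0, 0),
         endtime=datetime(2020, 6, 1, 12, 0),
         MULTIPROCESSING_DSET=True,
         MULTIPROCESSING_PROD=False,
         PROFILE_MULTIPROCESSING=True)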
def compute(self, **kwargs):
    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
        self._computed_result = dask.compute(self._result, **kwargs)[0]
    self._prof = prof
    self._rprof = rprof
    self._cprof = cprof
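# A minimal companion sketch (hypothetical, not part of the original class): since
# compute() above stores the three profilers on the instance, a helper such as
# visualize_profile() could render them later. The method name and file path are
# assumptions; dask.diagnostics.visualize accepts the output path as its second argument.
def visualize_profile(self, file_path="profile.html"):
    # Render the profilers captured during compute() into a single Bokeh document.
    from dask.diagnostics import visualize
    return visualize([self._prof, self._rprof, self._cprof], file_path, show=False)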
def _main(args):
    tic = time.time()

    log.info(banner())

    if args.disable_post_mortem:
        log.warn("Disabling crash debugging with the "
                 "Interactive Python Debugger, as per user request")
        post_mortem_handler.disable_pdb_on_error()

    log.info("Flagging on the {0:s} column".format(args.data_column))
    data_column = args.data_column
    masked_channels = [
        load_mask(fn, dilate=args.dilate_masks) for fn in collect_masks()
    ]
    GD = args.config

    log_configuration(args)

    # Group datasets by these columns
    group_cols = ["FIELD_ID", "DATA_DESC_ID", "SCAN_NUMBER"]
    # Index datasets by these columns
    index_cols = ['TIME']

    # Reopen the datasets using the aggregated row ordering
    columns = [data_column, "FLAG", "TIME", "ANTENNA1", "ANTENNA2"]
    if args.subtract_model_column is not None:
        columns.append(args.subtract_model_column)

    xds = list(
        xds_from_ms(args.ms,
                    columns=tuple(columns),
                    group_cols=group_cols,
                    index_cols=index_cols,
                    chunks={"row": args.row_chunks}))

    # Get support tables
    st = support_tables(args.ms)
    ddid_ds = st["DATA_DESCRIPTION"]
    field_ds = st["FIELD"]
    pol_ds = st["POLARIZATION"]
    spw_ds = st["SPECTRAL_WINDOW"]
    ant_ds = st["ANTENNA"]

    assert len(ant_ds) == 1
    assert len(ddid_ds) == 1

    antspos = ant_ds[0].POSITION.data
    antsnames = ant_ds[0].NAME.data
    fieldnames = [fds.NAME.data[0] for fds in field_ds]

    avail_scans = [ds.SCAN_NUMBER for ds in xds]
    args.scan_numbers = list(
        set(avail_scans).intersection(
            args.scan_numbers if args.scan_numbers is not None else avail_scans))

    if args.scan_numbers != []:
        log.info("Only considering scans '{0:s}' as "
                 "per user selection criterion".format(
                     ", ".join(map(str, map(int, args.scan_numbers)))))

    if args.field_names != []:
        flatten_field_names = []
        for f in args.field_names:
            # accept comma lists per specification
            flatten_field_names += [x.strip() for x in f.split(",")]
        for f in flatten_field_names:
            if re.match(r"^\d+$", f) and int(f) < len(fieldnames):
                flatten_field_names.append(fieldnames[int(f)])
        flatten_field_names = list(
            set(filter(lambda x: not re.match(r"^\d+$", x),
                       flatten_field_names)))
        log.info("Only considering fields '{0:s}' for flagging per "
                 "user selection criterion.".format(
                     ", ".join(flatten_field_names)))
        if not set(flatten_field_names) <= set(fieldnames):
            raise ValueError("One or more fields cannot be "
                             "found in dataset '{0:s}' "
                             "You specified {1:s}, but "
                             "only {2:s} are available".format(
                                 args.ms,
                                 ",".join(flatten_field_names),
                                 ",".join(fieldnames)))
        field_dict = {fieldnames.index(fn): fn for fn in flatten_field_names}
    else:
        field_dict = {i: fn for i, fn in enumerate(fieldnames)}

    # List which hold our dask compute graphs for each dataset
    write_computes = []
    original_stats = []
    final_stats = []

    # Iterate through each dataset
    for ds in xds:
        if ds.FIELD_ID not in field_dict:
            continue

        if (args.scan_numbers is not None
                and ds.SCAN_NUMBER not in args.scan_numbers):
            continue

        log.info("Adding field '{0:s}' scan {1:d} to "
                 "compute graph for processing".format(
                     field_dict[ds.FIELD_ID], ds.SCAN_NUMBER))

        ddid = ddid_ds[ds.attrs['DATA_DESC_ID']]
        spw_info = spw_ds[ddid.SPECTRAL_WINDOW_ID.data[0]]
        pol_info = pol_ds[ddid.POLARIZATION_ID.data[0]]

        nrow, nchan, ncorr = getattr(ds, data_column).data.shape

        # Visibilities from the dataset
        vis = getattr(ds, data_column).data
        if args.subtract_model_column is not None:
            log.info("Forming residual data between '{0:s}' and "
                     "'{1:s}' for flagging.".format(
                         data_column, args.subtract_model_column))
            vismod = getattr(ds, args.subtract_model_column).data
            vis = vis - vismod

        antenna1 = ds.ANTENNA1.data
        antenna2 = ds.ANTENNA2.data
        chan_freq = spw_info.CHAN_FREQ.data[0]
        chan_width = spw_info.CHAN_WIDTH.data[0]

        # Generate unflagged defaults if we should ignore existing flags
        # otherwise take flags from the dataset
        if args.ignore_flags is True:
            flags = da.full_like(vis, False, dtype=np.bool)
            log.critical("Completely ignoring measurement set "
                         "flags as per '-if' request. "
                         "Strategy WILL NOT or with original flags, even if "
                         "specified!")
        else:
            flags = ds.FLAG.data

        # If we're flagging on polarised intensity,
        # we convert visibilities to polarised intensity
        # and any flagged correlation will flag the entire visibility
        if args.flagging_strategy == "polarisation":
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items() if k != "I")
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "total_power":
            if args.subtract_model_column is None:
                log.critical("You requested to flag total quadrature "
                             "power, but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of "
                             "off-axis sources for broadband RFI.")
            corr_type = pol_info.CORR_TYPE.data[0].tolist()
            stokes_map = stokes_corr_map(corr_type)
            stokes_pol = tuple(v for k, v in stokes_map.items())
            vis = polarised_intensity(vis, stokes_pol)
            flags = da.any(flags, axis=2, keepdims=True)
        elif args.flagging_strategy == "standard":
            if args.subtract_model_column is None:
                log.critical("You requested to flag per correlation, "
                             "but not on residuals. "
                             "This is not advisable and the flagger "
                             "may mistake fringes of off-axis sources "
                             "for broadband RFI.")
        else:
            raise ValueError("Invalid flagging strategy '%s'" %
                             args.flagging_strategy)

        ubl = unique_baselines(antenna1, antenna2)
        utime, time_inv = da.unique(ds.TIME.data, return_inverse=True)
        utime, ubl = dask.compute(utime, ubl)
        ubl = ubl.view(np.int32).reshape(-1, 2)
        # Stack the baseline index with the unique baselines
        bl_range = np.arange(ubl.shape[0], dtype=ubl.dtype)[:, None]
        ubl = np.concatenate([bl_range, ubl], axis=1)
        ubl = da.from_array(ubl, chunks=(args.baseline_chunks, 3))

        vis_windows, flag_windows = pack_data(time_inv, ubl,
                                              antenna1, antenna2,
                                              vis, flags, utime.shape[0],
                                              backend=args.window_backend,
                                              path=args.temporary_directory)

        original_stats.append(
            window_stats(flag_windows, ubl, chan_freq,
                         antsnames, ds.SCAN_NUMBER,
                         field_dict[ds.FIELD_ID],
                         ds.attrs['DATA_DESC_ID']))

        with StrategyExecutor(antspos, ubl, chan_freq, chan_width,
                              masked_channels, GD['strategies']) as se:
            flag_windows = se.apply_strategies(flag_windows, vis_windows)

        final_stats.append(
            window_stats(flag_windows, ubl, chan_freq,
                         antsnames, ds.SCAN_NUMBER,
                         field_dict[ds.FIELD_ID],
                         ds.attrs['DATA_DESC_ID']))

        # Unpack window data for writing back to the MS
        unpacked_flags = unpack_data(antenna1, antenna2, time_inv,
                                     ubl, flag_windows)

        # Flag entire visibility if any correlations are flagged
        equalized_flags = da.sum(unpacked_flags, axis=2, keepdims=True) > 0
        corr_flags = da.broadcast_to(equalized_flags, (nrow, nchan, ncorr))

        if corr_flags.chunks != ds.FLAG.data.chunks:
            raise ValueError("Output flag chunking does not "
                             "match input flag chunking")

        # Create new dataset containing new flags
        new_ds = ds.assign(FLAG=(("row", "chan", "corr"), corr_flags))

        # Write back to original dataset
        writes = xds_to_table(new_ds, args.ms, "FLAG")
        # original should also have .compute called because we need stats
        write_computes.append(writes)

    if len(write_computes) > 0:
        # Combine stats from all datasets
        original_stats = combine_window_stats(original_stats)
        final_stats = combine_window_stats(final_stats)

        with contextlib.ExitStack() as stack:
            # Create dask profiling contexts
            profilers = []

            if can_profile:
                profilers.append(stack.enter_context(Profiler()))
                profilers.append(stack.enter_context(CacheProfiler()))
                profilers.append(stack.enter_context(ResourceProfiler()))

            if sys.stdout.isatty():
                # Interactive terminal, default ProgressBar
                stack.enter_context(ProgressBar())
            else:
                # Non-interactive, emit a bar every 5 minutes so
                # as not to spam the log
                stack.enter_context(ProgressBar(minimum=1, dt=5 * 60))

            _, original_stats, final_stats = dask.compute(
                write_computes, original_stats, final_stats)

        if can_profile:
            visualize(profilers)

        toc = time.time()

        # Log each summary line
        for line in summarise_stats(final_stats, original_stats):
            log.info(line)

        elapsed = toc - tic
        log.info("Data flagged successfully in "
                 "{0:02.0f}h{1:02.0f}m{2:02.0f}s".format(
                     (elapsed // 60) // 60,
                     (elapsed // 60) % 60,
                     elapsed % 60))
    else:
        log.info("User data selection criteria resulted in empty dataset. "
                 "Nothing to be done. Bye!")
result = (da_input**2. + da_input**3.).mean(axis=0)
result

# %% [markdown]
# ### Note that result hasn't been computed yet
#
# Here is a graph of how the calculation will be split among 4 threads

# %%
from dask.dot import dot_graph
dot_graph(result.dask)

# %% [markdown]
# ### Now do the calculation

# %%
with Profiler() as prof, ResourceProfiler(dt=0.1) as rprof,\
        CacheProfiler() as cprof:
    answer = result.compute()

# %% [markdown]
# Visualize the cpu, memory and cache for the 4 threads

# %%
visualize([prof, rprof, cprof], min_border_top=15, min_border_bottom=15)

# %% [markdown]
# ### You can evaluate your own functions on dask arrays
#
# If your functions release the GIL, you can get multithreaded computation using [dask.delayed](http://dask.pydata.org/en/latest/delayed.html), as sketched in the cell below.
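# %% [markdown]
# A minimal sketch (added as an illustration, not from the original notebook): wrapping an
# ordinary function with `dask.delayed` so that several calls run under the threaded
# scheduler and can be profiled with the same diagnostics as above. The function name
# `my_func` and the array sizes are arbitrary choices.

# %%
import dask
import numpy as np
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize

@dask.delayed
def my_func(x):
    # Pure-numpy work largely releases the GIL, so these calls can overlap across threads
    return (x**2. + x**3.).mean()

delayed_results = [my_func(np.random.random(1_000_000)) for _ in range(4)]

with Profiler() as prof2, ResourceProfiler(dt=0.1) as rprof2, CacheProfiler() as cprof2:
    answers = dask.compute(*delayed_results)

visualize([prof2, rprof2, cprof2], min_border_top=15, min_border_bottom=15)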
def rechunk_vanilla_dask(indir_path, outdir_path, nthreads, R, O, model):
    """ Rechunk using vanilla dask
    """
    in_arrays = load_input_files(indir_path)

    case = Merge('samplename')
    case.merge_hdf5_multiple(indir_path, store=False)
    reconstructed_array = case.get()

    out_files = list()  # to keep outfiles open during processing
    sources = list()
    targets = list()
    outfiles_partition = get_blocks_shape(R, O)
    for i in range(outfiles_partition[0]):
        for j in range(outfiles_partition[1]):
            for k in range(outfiles_partition[2]):
                out_filename = f'{i}_{j}_{k}.hdf5'
                out_file = h5py.File(os.path.join(outdir_path, out_filename), 'w')
                dset = out_file.create_dataset('/data', shape=O, dtype=np.float16)

                tmp_array = reconstructed_array[i * O[0]:(i + 1) * O[0],
                                                j * O[1]:(j + 1) * O[1],
                                                k * O[2]:(k + 1) * O[2]]
                print(
                    f'{i*O[0]}: {(i+1)*O[0]}, {j*O[1]}: {(j+1)*O[1]}, {k*O[2]}: {(k+1)*O[2]}'
                )

                out_files.append(out_file)
                sources.append(tmp_array)
                targets.append(dset)

    rechunk_task = da.store(sources, targets, compute=False)

    # rechunk_task.visualize(filename="tmp_dir/test_graph_vanilla.png")
    # sys.exit()

    with Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
        scheduler = 'single-threaded' if nthreads == 1 else 'threads'
        with dask.config.set(scheduler=scheduler):
            try:
                t = time.time()
                rechunk_task.compute()
                t = time.time() - t
                # visualize([prof, rprof, cprof])
            except Exception as e:
                print(e, "\nSomething went wrong during graph execution.")
                t = None

    diagnostics = os.path.join(outdir_path, 'exp5_' + str(model) + '.html')
    visualize([prof, rprof, cprof], diagnostics, show=False)
    clean_files()

    for f in out_files:
        f.close()

    return t
import dask.array as da
from dask.diagnostics import CacheProfiler
from cachey import nbytes

if __name__ == '__main__':
    a = da.random.normal(size=(1000, 10000), chunks=(1000, 1000))
    res = a.dot(a.T).mean(axis=0)

    with CacheProfiler(metric=nbytes) as rprof:
        out = res.compute()

    rprof.visualize()

    # for res in rprof.results:
    #     print(res)
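    # A hedged follow-up to the commented-out loop above (not part of the original
    # script): each CacheProfiler result is a namedtuple with fields
    # (key, task, metric, cache_time, free_time), so the per-key byte counts collected
    # by the cachey nbytes metric can also be printed without the Bokeh plot.
    for key, _task, metric, cache_time, free_time in rprof.results:
        print(f"{key}: {metric} bytes, held in cache for {free_time - cache_time:.4f}s")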
import sys
import time

import dask
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from multiprocessing.pool import ThreadPool

import hyperspy.api as hs

emd_filename_list = sys.argv[1:]
emd_filename_list.sort()

with dask.set_options(pool=ThreadPool(8)), Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
    for emd_filename in emd_filename_list:
        s = hs.load(emd_filename, lazy=True).transpose(signal_axes=(2, 3))
        t0 = time.time()
        result = s.sum()
        print(emd_filename)
        delta = time.time() - t0
        print(delta)
        print("{} MB/s".format(s.data.nbytes / delta / 1024 / 1024))

visualize([prof, rprof, cprof])
def test_cache_profiler_plot_with_invalid_bokeh_kwarg_raises_error():
    with CacheProfiler(metric_name="non-standard") as cprof:
        get(dsk, "e")
    with pytest.raises(AttributeError, match="foo_bar"):
        cprof.visualize(foo_bar="fake")
import sys
import time

import dask
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from multiprocessing.pool import ThreadPool

import hyperspy.api as hs

emd_filename_list = sys.argv[1:]
emd_filename_list.sort()

with dask.set_options(
        pool=ThreadPool(8)), Profiler() as prof, ResourceProfiler(
            dt=0.25) as rprof, CacheProfiler() as cprof:
    for emd_filename in emd_filename_list:
        s = hs.load(emd_filename, lazy=True).transpose(signal_axes=(2, 3))
        t0 = time.time()
        result = s.sum()
        print(emd_filename)
        delta = time.time() - t0
        print(delta)
        print(f"{s.data.nbytes / delta / 1024 / 1024} MB/s")

visualize([prof, rprof, cprof])
times = list()
for buffer in buffers_to_test:
    print("RUNNING BUFFER ", buffer)

    with h5py.File(input_filepath, 'r') as f_in:  # open original array
        dset = f_in['/data']
        in_arr = da.from_array(dset, chunks=split_cs)

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
            # run optimized
            split_arr = split_to_hdf5(in_arr, f_out, nb_blocks=None)
            print("RUNNING OPTIMIZED")
            enable_clustering(buffer)
            flush_cache()
            with Profiler() as prof, ResourceProfiler(
                    ) as rprof, CacheProfiler(metric=nbytes) as cprof:
                with dask.config.set(scheduler='single-threaded'):
                    t = time.time()
                    _ = split_arr.compute()
                    t = time.time() - t
                    times.append([buffer, t, "optimized"])
            visualize([prof, rprof, cprof],
                      os.path.join(output_directory, str(buffer) + "opti" + ".html"),
                      show=False)

        os.remove(output_filepath)  # remove output file for next run

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
            # run non optimized
            split_arr = split_to_hdf5(in_arr, f_out, nb_blocks=None)
            print("RUNNING NON OPTIMIZED")