Beispiel #1
0
class GraphData:
    """Snapshot and rolling history of a Dask cluster's CPU and memory usage.

    All figures come from ``Client.scheduler_info()['workers']``.  Memory
    values are divided by ``1024**2`` (bytes to MiB, assuming the scheduler
    reports bytes) and rounded to two decimals.  ``cpu_util`` and
    ``mem_util`` are fixed-length histories of ``graph_num_bars`` samples,
    oldest first.
    """

    def __init__(self, graph_num_bars, dask_address):
        """Connect to the scheduler and take an initial sample.

        Args:
            graph_num_bars: number of samples kept in each history list.
            dask_address: scheduler address passed to ``Client(address=...)``.
        """
        self.dask_client = Client(address=dask_address)

        self.currentValue = {'Memory': {'total_memory': 0,
                                        'used_memory': 0},
                             'CPU': {'cpu_usage': 0},
                             'Cluster': {'n_workers': 0,
                                         'total_threads': 0},
                             'Workers': []}
        self.update_dask_values()

        # Constants data
        self.mem_max_value = self.currentValue['Memory']['total_memory']
        self.util_max_value = 100
        self.graph_num_bars = graph_num_bars

        # Data for graphs
        self.cpu_util = [0] * graph_num_bars
        self.mem_util = [0] * graph_num_bars
        # Data for statistics
        self.n_workers = self.num_workers()
        self.total_mem = self.currentValue['Memory']['total_memory']
        self.used_mem = self.currentValue['Memory']['used_memory']

    def close_con(self):
        """Close the connection to the Dask scheduler."""
        self.dask_client.close()

    def update_all(self):
        """Re-sample the cluster and push one new point onto each history."""
        self.update_dask_values()

        memory = self.currentValue['Memory']
        self.n_workers = self.num_workers()
        self.mem_max_value = memory['total_memory']
        self.total_mem = memory['total_memory']
        self.used_mem = memory['used_memory']

        self.cpu_util = self.update_graph_val(self.cpu_util, self.cpu_usage())
        self.mem_util = self.update_graph_val(self.mem_util, self.used_mem)

    def reset(self):
        """Zero both histories and the cached memory statistics."""
        self.cpu_util = [0] * self.graph_num_bars
        self.mem_util = [0] * self.graph_num_bars

        self.mem_max_value = 0
        self.total_mem = 0
        self.used_mem = 0

    def update_graph_val(self, values, new_val):
        """Return a new history ending in *new_val*.

        The result always holds exactly ``graph_num_bars`` items: the most
        recent ``graph_num_bars - 1`` entries of *values* (left-padded with
        zeros when there are fewer) followed by *new_val*.  *values* itself
        is not modified.

        Args:
            values: the current history, oldest sample first.
            new_val: the sample to append.

        Returns:
            A new list of length ``graph_num_bars`` (empty when that is 0
            or negative).
        """
        size = self.graph_num_bars
        if size <= 0:
            return []
        # ``[-0:]`` would return the whole list, so a single-bar graph is
        # handled explicitly.
        recent = list(values)[-(size - 1):] if size > 1 else []
        padding = [0] * (size - 1 - len(recent))
        return padding + recent + [new_val]

    def update_dask_values(self):
        """Refresh ``worker_info`` and rebuild ``currentValue`` from it."""
        self.worker_info = self.dask_client.scheduler_info()['workers']

        total = round(self.available_memory() / (1024**2), 2)
        used = round(self.used_memory() / (1024**2), 2)
        memory = self.currentValue['Memory']
        memory['total_memory'] = total
        memory['used_memory'] = used
        # Despite the key name this is a 0..1 fraction.  With no workers the
        # total is 0, so fall back to 0 instead of dividing by zero.
        memory['used_memory_percent'] = used / total if total else 0

        self.currentValue['CPU']['cpu_usage'] = self.cpu_usage()
        self.currentValue['Cluster']['n_workers'] = self.num_workers()
        self.currentValue['Cluster']['total_threads'] = self.num_threads()
        self.currentValue['Workers'] = self.get_worker_stats()

    def num_workers(self):
        """Return the number of workers in the last sample."""
        return len(self.worker_info)

    def num_threads(self):
        """Return the sum of ``nthreads`` over all workers."""
        return sum(info['nthreads'] for info in self.worker_info.values())

    def available_memory(self):
        """Return the sum of ``memory_limit`` over all workers (raw units)."""
        return sum(info['memory_limit'] for info in self.worker_info.values())

    def used_memory(self):
        """Return the sum of ``metrics['memory']`` over all workers (raw units)."""
        return sum(info['metrics']['memory']
                   for info in self.worker_info.values())

    def get_worker_stats(self):
        """Return one stats dict per worker.

        Each dict starts from fixed placeholder fields ('user', 'id', 'name',
        ...) and is then filled with the worker's address, thread count, CPU
        figure, and memory / read / write values divided by ``1024**2``.
        """
        worker_stats = []
        for address, info in self.worker_info.items():
            metrics = info['metrics']
            stats = {'user': '******',
                     'id': 'filler',
                     'name': 'filler',
                     'rawtime': 1,
                     'time': 1,
                     'command': '',
                     'cpu': 1,
                     'memory': 1,
                     'local_ports': 'filler'}
            stats['address'] = address
            stats['nthreads'] = info['nthreads']
            stats['memory'] = round(metrics['memory'] / (1024**2), 2)
            stats['memory_limit'] = round(info['memory_limit'] / (1024**2), 2)
            stats['cpu'] = metrics['cpu']
            stats['read'] = round(metrics['read_bytes'] / (1024**2), 2)
            stats['write'] = round(metrics['write_bytes'] / (1024**2), 2)

            worker_stats.append(stats)
        return worker_stats

    def cpu_usage(self):
        """Return the mean of ``metrics['cpu']`` across workers, or 0 if none."""
        usages = [info['metrics']['cpu'] for info in self.worker_info.values()]
        if usages:
            return sum(usages) / len(usages)
        return 0
Beispiel #2
0
#!/usr/bin/env python

from dask_jobqueue import SLURMCluster
from dask.distributed import Client

# Describe the SLURM jobs that will host the Dask workers: 28 cores and
# 120GB per job, a six-minute walltime, extra scheduler directives passed
# through `job_extra`, and the network interface to bind to.
cluster = SLURMCluster(
    cores=28,
    name='test-jobqueue',
    walltime='00:06:00',
    job_extra=['--constraint=HSW24', '--exclusive', '--nodes=1'],
    memory='120GB',
    interface='enp5s0f0')
# Ask the cluster to scale to 196 (see dask_jobqueue's `scale` for the unit).
cluster.scale(196)
# Bare expression: no effect when run as a script; presumably left over from
# a notebook cell where it displays the object - TODO confirm.
cluster

# Connect a client to the cluster created above.
from dask.distributed import Client
client = Client(cluster)
# Bare expression, same remark as for `cluster` above.
client

# Report how many workers the scheduler knows about at this moment.
print('Currently working with ' +
      str(len(client.scheduler_info()["workers"])) + ' workers')
def all_ping(client: Client):
    """Time a single ``client.run(ping, ...)`` call and print the duration.

    The list of worker keys currently reported by the scheduler is passed to
    ``ping`` as its argument; the elapsed wall-clock time is printed through
    ``format_time``.
    """
    worker_keys = list(client.scheduler_info()["workers"])
    began = time.time()
    client.run(ping, worker_keys)
    elapsed = time.time() - began
    print(format_time(elapsed))
Beispiel #4
0
        print(f"Initializing Local Dask cluster")
        client = Client()
    else:
        if scheduler is None:
            cluster = EDASCluster()
            print(
                "Initializing Dask-distributed cluster with scheduler address: "
                + cluster.scheduler_address)
            client = Client(cluster.scheduler_address, timeout=60)
            time.sleep(20)
        else:
            print("Initializing client with existing scheduler at: " +
                  scheduler)
            client = Client(scheduler)

    scheduler_info = client.scheduler_info()
    workers: Dict = scheduler_info.pop("workers")
    print(" @@@@@@@ SCHEDULER INFO: " + str(scheduler_info))
    print(f" N Workers: {len(workers)} ")

    start_time1 = time.time()
    job1 = Job.init("Test", "SCHEDULER_TEST", "jobId", domains, variables,
                    operations, [])
    print("Running workflow for requestId " + job1.requestId)
    result1 = edasOpManager.buildTask(job1)
    print("Completed first workflow in time " + str(time.time() - start_time1))

    start_time2 = time.time()
    job2 = Job.init("Test", "SCHEDULER_TEST", "jobId", domains, variables,
                    operations, [])
    print("Running workflow for requestId " + job2.requestId)
Beispiel #5
0
def launch_python_post():
    """Drive the WRF Python post-processing run from start to finish.

    The stages mirror the numbered log messages:

    1. Load the settings, read the required ``PYTHON_POST_*`` environment
       variables, start a Dask scheduler and client, write and submit the
       worker job files, then block until the requested number of workers
       has connected.
    2. Run the calculation routines on the cluster and wait for the result.
    3. Run the plotting routines on the cluster and wait for the result.
    4. Retire the workers and close the client.

    Every failure is logged, the logger is closed, and the process ends
    through ``sys.exit("")``.  ``scheduler_port`` is not defined in this
    function; it is read from the enclosing module scope.
    """
    logger = PyPostTools.pyPostLogger()

    logger.write("Initializing WRF Python Post-Processing Program")
    #Step 1: Load program settings
    logger.write(" 1. Application Initalization")
    logger.write("  - Loading control file, python_post_control.txt")
    _pySet = PyPostSettings.PyPostSettings()
    logger.write("  - Success!")
    logger.write("  - Testing Environmental Variables")
    try:
        dask_nodes = os.environ["PYTHON_POST_NODES"]
        dask_threads = os.environ["PYTHON_POST_THREADS"]
        # Read only to verify that the variable is set; not used afterwards.
        postDir = os.environ["PYTHON_POST_DIR"]
        targetDir = os.environ["PYTHON_POST_TARG_DIR"]
    except KeyError:
        logger.write(
            "***FAIL*** KeyError encountered while trying to access important environmental variables, abort."
        )
        # Close the log here too, as the later failure paths do.
        logger.close()
        sys.exit("")
    logger.write("  - Success!")
    logger.write("  - Initializing Dask (" + str(dask_nodes) +
                 " Nodes Requested), Collecting routines needed")
    _routines = Routines.Routines()
    # Start Dask Tasks
    #cLoop = IOLoop.current()
    #t = Thread(target = cLoop.start, daemon = True)
    #t.start()

    logger.write("   - Async IO Loop initialized...")

    async def f(port):
        # Start a scheduler on the given port and stay alive until it ends.
        s = Scheduler(port=port)
        s = await s
        await s.finished()
        return 1

    # NOTE(review): asyncio.gather() only schedules the coroutine; nothing in
    # this function runs an event loop on this thread.  Confirm the scheduler
    # is really listening before the Client below connects.
    asyncio.gather(f(scheduler_port))

    #asyncio.get_event_loop().run_until_complete(f(scheduler_port))

    logger.write("   - Dask Scheduler initialized (Port " +
                 str(scheduler_port) + ")...")
    dask_client = Client("tcp://" + socket.gethostname() + ":" +
                         str(scheduler_port))
    logger.write("   - Dask Client initialized...")
    logger.write("   - Writing Dask Worker Job Files...")
    with PyPostTools.cd(targetDir):
        writeFile1 = PyPostTools.write_job_file(socket.gethostname(),
                                                scheduler_port,
                                                project_name="Nowcast",
                                                queue="default",
                                                nodes=dask_nodes,
                                                wall_time=60,
                                                nProcs=1)
        writeFile2 = PyPostTools.write_worker_file(socket.gethostname(),
                                                   scheduler_port,
                                                   nProcs=1)
        # Kept as an equality test: the helpers' exact return values are not
        # visible here, so only an explicit False counts as failure.
        if writeFile1 == False or writeFile2 == False:
            dask_client.close()
            logger.write(
                "   - Failed to write job files, are you missing an important parameter?"
            )
            logger.close()
            sys.exit("")
        else:
            logger.write(
                "   - Dask Worker Job File Written, Submitting to Queue.")
            PyPostTools.popen("chmod +x launch-worker.sh")
            PyPostTools.popen("chmod +x dask-worker.job")
            PyPostTools.popen("qsub dask-worker.job")
    # Wait here for workers.
    logger.write("   -> Worker Job submitted to queue, waiting for workers...")
    while len(dask_client.scheduler_info()['workers']) < int(dask_nodes):
        time.sleep(2)
    logger.write("   -> Workers are now connected.")
    #logger.write("   - Adding local packages to dask workers")
    #dask_client.upload_file("PyPostTools.py")
    #dask_client.upload_file("ArrayTools.py")
    #dask_client.upload_file("Calculation.py")
    #dask_client.upload_file("ColorMaps.py")
    #dask_client.upload_file("Conversions.py")
    #dask_client.upload_file("Plotting.py")
    #dask_client.upload_file("PyPostSettings.py")
    #dask_client.upload_file("Routines.py")
    logger.write("  - Success!")
    logger.write(" 1. Done.")
    logger.write(" 2. Start Post-Processing Calculations")
    calculation_future = start_calculations(dask_client, _routines,
                                            dask_threads)
    if calculation_future is not None:
        wait(calculation_future)
        result_calc = dask_client.gather(calculation_future)[0]
        if result_calc != 0:
            logger.write(
                "***FAIL*** An error occured in calculations method, check worker logs for more info."
            )
            logger.close()
            sys.exit("")
    logger.write(" 2. Done.")
    logger.write(" 3. Generating Figures")
    logger.write("  - Collecting files from target directory (" + targetDir +
                 ").")
    # os.path.join keeps the pattern correct whether or not targetDir ends
    # with a path separator.
    fList3 = sorted(glob.glob(os.path.join(targetDir, "WRFPRS_F*")))
    logger.write("  - " + str(len(fList3)) + " files have been found.")
    logger.write(" -> Pushing run_plotting_routines() to dask.")
    fullDict = _pySet.get_full_dict()
    plotting_future = start_plotting(dask_client, fullDict, dask_threads)
    wait(plotting_future)
    result_plot = dask_client.gather(plotting_future)[0]
    if result_plot != 0:
        logger.write(
            "***FAIL*** An error occured in plotting method, check worker logs for more info."
        )
        logger.close()
        sys.exit("")
    logger.write(" 3. Done.")
    logger.write(" 4. Final Steps")

    logger.write(" 4. Done, Closing Dask Client.")
    # Pass the worker addresses (the dict keys), not the whole info mapping.
    dask_client.retire_workers(
        workers=list(dask_client.scheduler_info()['workers']),
        close=True)
    dask_client.close()
    logger.write("All Steps Completed.")
    logger.write("***SUCCESS*** Program execution complete.")
    logger.close()
if __name__ == "__main__":
    if use_local_cluster:
        print(f"Creating local cluster with {ncpus_local} workers."
              f" Dashboard address: {dashboard_address}")
        client = Client(
            processes=True,
            dashboard_address=dashboard_address,
            n_workers=ncpus_local,
            threads_per_worker=1,
            memory_limit="4GB",
        )
    else:
        print(f"Creating Slurm cluster at {slurm_cluster_ip}."
              f" Dashboard address: {dashboard_address}")
        client = Client(parameters["slurm_cluster_ip"])
    parameters["ncpus"] = len(client.scheduler_info()["workers"])
    print("Cluster created!")

    datasets = parameters["grouping"].keys()
    # datasets = ["dy_m100_mg"]

    parameters["hist_vars"] = [
        "dimuon_mass",
        "dimuon_pt",
        "dimuon_eta",
        "dimuon_phi",
        "dimuon_dEta",
        "dimuon_dPhi",
        "dimuon_dR",
        "dimuon_rap",
        "dimuon_cos_theta_cs",
ask_workers=2
memory='4GB'
from dask_jobqueue import PBSCluster
from dask.distributed import Client
import dask.dataframe as dd

# Start a PBS-backed cluster and scale it to the requested worker count.
cluster = PBSCluster(
    cores=1,
    memory=memory,
    project='PerfTestPangeo',
    walltime='04:00:00',
)
cluster.scale(ask_workers)

c = Client(cluster)

c

from dask.utils import ensure_dict, format_bytes

# Summarise what the scheduler currently reports: worker count, total
# threads and, when every worker has a non-zero limit, total memory.
wk = c.scheduler_info()["workers"]

memory = [info["memory_limit"] for info in wk.values()]
cores = sum(info["nthreads"] for info in wk.values())
text = f"Workers= {len(wk)}, Cores={cores}"
if all(memory):
    text = f"{text}, Memory={format_bytes(sum(memory))}"
print(text)
#Workers= 2, Cores=2, Memory=8.00 GB

%time ds=xr.open_zarr('/work/ALT/odatis/eNATL60/zarr/eNATL60-BLBT02-SSH-1h')
#56.3 ms

%time mean=ds.sossheig.mean(dim='time_counter')
#195 ms
# Hyperparameter grid for the search: only n_estimators is varied (20 or 30).
model_params = {'n_estimators': [20, 30]}

# Create the random forest classifier; random_state is fixed to 1.
rf = RandomForestClassifier(random_state=1)

# Set up the grid-search meta-estimator over `model_params` with cv=3.
clf = GridSearchCV(rf, model_params, cv=3)

# DASK - TRAIN MODEL
# Fit the grid search inside joblib's 'dask' parallel backend context.
# X_train / y_train are expected to be defined earlier in the script.
from joblib import Parallel, parallel_backend
with parallel_backend('dask'):
    model = clf.fit(X_train, y_train)

# Optional - dump the Dask scheduler info as indented JSON.
# `client` is expected to be defined earlier in the script.

import json
print(json.dumps(client.scheduler_info(), indent=4))

# Stop the CDSW workers; everything below runs after this call.
import cdsw
cdsw.stop_workers()

# Print the parameters of the winning estimator.
from pprint import pprint
pprint(model.best_estimator_.get_params())

# Generate predictions on X_test using the fitted search object.
predictions = model.predict(X_test)
print(predictions)
Beispiel #9
0
def start_dask_cluster(environment=None,
                       worker_profile='Medium Worker',
                       profile='default',
                       region='us-west-2',
                       endpoint=None,
                       worker_min=2,
                       worker_max=20,
                       adaptive_scaling=True,
                       wait_for_cluster=True,
                       cfile=None,
                       use_existing_cluster=True,
                       propagate_env=False):
    '''
    Start (or reuse) a Dask Gateway cluster and return a connected client.

    environment      - should match the kernel running. None (default) resolves
                       to the basename of $CONDA_PREFIX when the function is
                       called; a KeyError is raised if that variable is unset
    worker profile   - 'Small Worker', 'Medium Worker', or 'Pangeo Worker' (determines available memory in a worker)
    profile          - 'default' is good, but others can be used
    region           - AWS region
    endpoint         - None by default matches region. Set correct endpoint to s3 buckets
    worker_min       - minimum number of workers (for adaptive scaling)
    worker_max       - maximum number of workers
    adaptive_scaling - Default True. If False, launches worker_max workers
    wait_for_cluster - Default True. Block until worker_min (adaptive) or
                       worker_max (fixed) workers are connected
    cfile            - None. Finds aws credentials in this file
    use_existing_cluster - Default True. Connect to the first listed cluster if any exists
    propagate_env    - Default False. Set to True when working with Cloud VRTs

    Returns the tuple (client, cluster).
    '''
    # Resolved at call time so that importing this module does not require a
    # conda environment.
    if environment is None:
        environment = os.path.basename(os.environ['CONDA_PREFIX'])

    if not endpoint:
        endpoint = f's3.{region}.amazonaws.com'

    set_credentials2(profile=profile,
                     region=region,
                     endpoint=endpoint,
                     cfile=cfile)

    # `gateway` is local to this function, so there is never an earlier
    # instance to reuse; always create one.
    gateway = Gateway()

    # Query the gateway once and reuse the snapshot for reporting and for the
    # reuse decision below.
    clusters = gateway.list_clusters()
    if clusters:
        print('Existing Dask clusters:')
        for j, c in enumerate(clusters):
            print(f'Cluster Index c_idx: {j} / Name:', c.name, c.status)
    else:
        print('No Cluster running.')

    # TODO Check if worker_profile is the same, otherwise start new cluster
    if clusters and use_existing_cluster:
        print('Using existing cluster [0].')
        cluster = gateway.connect(clusters[0].name)
    else:
        print('Starting new cluster.')
        cluster = gateway.new_cluster(environment=environment,
                                      profile=worker_profile)

    if adaptive_scaling:
        print(f'Setting Adaptive Scaling min={worker_min}, max={worker_max}')
        cluster.adapt(minimum=worker_min, maximum=worker_max)
    else:
        print(f'Setting Fixed Scaling workers={worker_max}')
        cluster.scale(worker_max)

    # Best effort: connect and close once before the real connection.  A
    # failure here is reported but must not abort the start-up.
    try:
        client = Client(cluster)
        client.close()
        print('Reconnect client to clear cache')
    except Exception as exc:
        print(f'Initial connect/close failed, continuing: {exc}')
    client = Client(cluster)

    print(
        f'client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):\n{client.dashboard_link}'
    )

    if wait_for_cluster:
        target_workers = worker_min if adaptive_scaling else worker_max
        # Same source as the polling loop below.
        live_workers = len(client.scheduler_info()['workers'])
        t = 0
        interval = 2
        print(
            f'Elapsed time to wait for {target_workers} live workers:\n{live_workers}/{target_workers} workers - {t} seconds',
            end='')
        while live_workers < target_workers:
            sleep(interval)
            t += interval
            print(f'\r{live_workers}/{target_workers} workers - {t} seconds',
                  end='')
            live_workers = len(client.scheduler_info()['workers'])
        print(f'\r{live_workers}/{target_workers} workers - {t} seconds')

    # We need to propagate credentials to the workers
    #set_credentials(profile=profile,region=region,endpoint=endpoint)

    if propagate_env:
        print('Propagating environment variables to workers')

        class InitWorker(WorkerPlugin):
            """Worker plugin that uploads files / a script to each worker."""
            name = "init_worker"

            def __init__(self, filepath=None, script=None):
                # Maps the file name to upload to its content.
                self.data = {}
                if filepath:
                    if isinstance(filepath, str):
                        filepath = [filepath]
                    for file_ in filepath:
                        with open(file_, "rb") as f:
                            filename = os.path.basename(file_)
                            self.data[filename] = f.read()
                if script:
                    filename = f"{uuid.uuid1()}.py"
                    self.data[filename] = script

            async def setup(self, worker):
                # Upload every stored file with load=True and check that the
                # reported byte counts match what was sent.
                responses = await asyncio.gather(*[
                    worker.upload_file(
                        comm=None, filename=filename, data=data, load=True)
                    for filename, data in self.data.items()
                ])
                assert all(
                    len(data) == r["nbytes"]
                    for r, data in zip(responses, self.data.values()))

        # NOTE(review): the AWS credentials are embedded as plain text in the
        # uploaded script, and an unset variable is written as the literal
        # string "None" - confirm both are acceptable.
        script = f"""
        \rimport os
        \ros.environ["AWS_ACCESS_KEY_ID"] = "{os.getenv("AWS_ACCESS_KEY_ID")}"
        \ros.environ["AWS_SECRET_ACCESS_KEY"] = "{os.getenv("AWS_SECRET_ACCESS_KEY")}"
        \ros.environ["AWS_DEFAULT_REGION"] = "{os.getenv("AWS_DEFAULT_REGION")}"
        \ros.environ["GDAL_DISABLE_READDIR_ON_OPEN"] ="EMPTY_DIR"
        """

        plugin = InitWorker(script=script)
        client.register_worker_plugin(plugin)

    return client, cluster
Beispiel #10
0
class ProcessManager(GenericProcessManager):
  """Singleton-style owner of the Dask client used to run EDAS jobs.

  Depending on the "scheduler.address" configuration entry the client is
  attached to a Slurm cluster ("slurm" or "slurm:<queue>"), to an existing
  scheduler at that address, or to a freshly created LocalCluster when no
  address is configured.
  """
  manager: Optional["ProcessManager"] = None

  @classmethod
  def getManager( cls ) -> Optional["ProcessManager"]:
      """Return the shared manager, or None if initManager was never called."""
      return cls.manager

  @classmethod
  def initManager( cls, serverConfiguration: Dict[str,str] ) -> "ProcessManager":
      """Create the shared manager on first use and return it."""
      if cls.manager is None:
          cls.manager = ProcessManager(serverConfiguration)
      return cls.manager

  def __init__( self, serverConfiguration: Dict[str,str] ):
      """Connect to (or create) the Dask cluster described by the configuration."""
      self.config = serverConfiguration
      self.logger =  EDASLogger.getLogger()
      self.num_wps_requests = 0
      self.scheduler_address = serverConfiguration.get("scheduler.address",None)
      # Configuration values are strings; adapt() needs a number.
      self.maxworkers = int( serverConfiguration.get("scheduler.maxworkers", 16 ) )
      self.submitters = []
      self.slurm_clusters = {}
      self.active = True
      if self.scheduler_address is not None:
          if self.scheduler_address.lower().startswith("slurm"):
            scheduler_parms = self.scheduler_address.split(":")
            queue = "default" if len(scheduler_parms) < 2 else scheduler_parms[1]
            self.client = Client( self.getSlurmCluster(queue) )
          else:
            self.logger.info( "Initializing Dask-distributed cluster with scheduler address: " + self.scheduler_address )
            self.client = Client( self.scheduler_address, timeout=63 )
      else:
          nWorkers = int( self.config.get("dask.nworkers",multiprocessing.cpu_count()) )
          self.client = Client( LocalCluster( n_workers=nWorkers ) )
          self.scheduler_address = self.client.scheduler.address
          self.logger.info( f"Initializing Local Dask cluster with {nWorkers} workers,  scheduler address = {self.scheduler_address}")
          self.client.submit( lambda x: edasOpManager.buildIndices( x ), nWorkers )
      self.ncores = self.client.ncores()
      self.logger.info(f" ncores: {self.ncores}")
      self.scheduler_info = self.client.scheduler_info()
      # "workers" is removed from scheduler_info and kept separately.
      self.workers: Dict = self.scheduler_info.pop("workers")
      self.logger.info(f" workers: {self.workers}")
      # NOTE(review): a string value such as "false" is truthy here - confirm
      # how this option is expected to be written in the configuration.
      log_metrics = serverConfiguration.get("log.scheduler.metrics", False )
      if log_metrics:
        self.metricsThread =  Thread( target=self.trackMetrics )
        self.metricsThread.start()

  def getSlurmCluster( self, queue: str ):
      """Return the cached SLURMCluster for *queue*, creating it on first use.

      The cluster is (re)configured for adaptive scaling between 1 and
      ``self.maxworkers`` workers on every call.
      """
      self.logger.info( f"Initializing Slurm cluster using queue {queue}" )
      cluster = self.slurm_clusters.get( queue )
      if cluster is None:
          # Construct only when missing: dict.setdefault would evaluate the
          # constructor on every call, even for an already cached queue.
          cluster = SLURMCluster() if queue == "default" else SLURMCluster( queue=queue )
          self.slurm_clusters[queue] = cluster
      cluster.adapt( minimum=1, maximum=self.maxworkers, interval="2s", wait_count=500 )
      print( "CLUSTER JOB SCRIPT: " + cluster.job_script() )
      return cluster

  def getCWTMetrics(self) -> Dict:
      """Return scheduler counts and per-worker cpu/memory figures.

      The result has one sub-dict per metric name.  When the profile data
      cannot be fetched every sub-dict is left empty.
      """
      metrics_data = { key:{} for key in ['user_jobs_queued','user_jobs_running','wps_requests','cpu_ave','cpu_count','memory_usage','memory_available']}
      metrics = self.getProfileData()
      if metrics is None:
          return metrics_data
      counts = metrics["counts"]
      workers = metrics["workers"]
      for key in ['tasks','processing','released','memory','saturated','waiting','waiting_data','unrunnable']:
          metrics_data['user_jobs_running'][key] = counts[key]
      for key in ['tasks', 'waiting', 'waiting_data', 'unrunnable']:
          metrics_data['user_jobs_queued'][key] = counts[key]
      for wId, wData in workers.items():
          worker_metrics = wData["metrics"]
          total_memory   = wData["memory_limit"]
          memory_usage = worker_metrics["memory"]
          metrics_data['memory_usage'][wId] = memory_usage
          metrics_data['memory_available'][wId] = total_memory - memory_usage
          metrics_data['cpu_count'][wId] = wData["ncores"]
          metrics_data['cpu_ave'][wId] = worker_metrics["cpu"]
      return metrics_data

  def trackMetrics(self, sleepTime=1.0 ):
      """Poll the scheduler until ``self.active`` is cleared, logging activity.

      The idle message is logged once per idle period; while tasks are being
      processed the counts, per-worker metrics and health are logged on
      every pass.  The loop sleeps *sleepTime* seconds after each pass,
      whether the cluster is idle, busy or unreachable.
      """
      isIdle = False
      self.logger.info(" ** TRACKING METRICS ** ")
      while self.active:
          metrics = self.getProfileData()
          # None means the fetch failed (already logged); just retry later.
          if metrics is not None:
              counts = metrics["counts"]
              if counts['processing'] == 0:
                  if not isIdle:
                    self.logger.info(" ** CLUSTER IS IDLE ** ")
                    isIdle = True
              else:
                  isIdle = False
                  self.logger.info( f" METRICS: {metrics['counts']} " )
                  workers = metrics["workers"]
                  for key,value in workers.items():
                      self.logger.info( f" *** {key}: {value}" )
                  self.logger.info(f" HEALTH: {self.getHealth()}")
          time.sleep( sleepTime )

  def getWorkerMetrics(self):
      """Return selected scheduler_info fields per worker, keyed "W0", "W1", ..."""
      metrics = {}
      wkeys = [ 'ncores', 'memory_limit', 'last_seen', 'metrics' ]
      scheduler_info = self.client.scheduler_info()
      workers: Dict = scheduler_info.get( "workers", {} )
      for iW, worker in enumerate( workers.values() ):
          metrics[f"W{iW}"] = { wkey: worker[wkey] for wkey in wkeys }
      return metrics

  def getDashboardAddress(self):
      """Build the dashboard URL (port 8787) from the scheduler address.

      NOTE(review): this takes the second-to-last ":"-separated token as the
      host, which fits "tcp://host:port" but not the "slurm[:queue]" form
      accepted by __init__ - confirm.
      """
      stoks = self.scheduler_address.split(":")
      host_address = stoks[-2].strip("/")
      return f"http://{host_address}:8787"

  def getCounts(self) -> Dict:
      """Fetch and decode ``json/counts.json`` from the dashboard."""
      profile_address = f"{self.getDashboardAddress()}/json/counts.json"
      return requests.get(profile_address).json()

  def getHealth(self, mtype: str = "" ) -> str:
      """Fetch the text of the dashboard ``/health`` endpoint (*mtype* is unused)."""
      profile_address = f"{self.getDashboardAddress()}/health"
      return requests.get(profile_address).text

  def getMetrics(self, mtype: str = "" ) -> Optional[Dict]:
      """Return counts plus the sections named in the comma-separated *mtype*.

      Returns None when no task is currently being processed.
      """
      counts = self.getCounts()
      if counts['processing'] == 0: return None
      mtypes = mtype.split(",")
      metrics = { "counts": counts }
      if "processing" in mtypes:  metrics["processing"] = self.client.processing()
      if "profile" in mtypes:     metrics["profile"]    = self.client.profile()
      return metrics

  def getProfileData( self, mtype: str = "" ) -> Optional[Dict]:
      """Return ``{"counts": ..., "workers": ...}``, or None if fetching fails.

      Any exception is logged with its traceback; *mtype* is unused.
      """
      try:
        return { "counts": self.getCounts(), "workers": self.getWorkerMetrics() }
      except Exception:
          self.logger.error( "Error in getProfileData")
          self.logger.error(traceback.format_exc())
          return None

      # response2: requests.Response = requests.get(tasks_address)
      # print(f"\n  ---->  Tasks Data from {tasks_address}: \n **  {response2.text} ** \n" )
      # response3: requests.Response = requests.get(workers_address)
      # print(f"\n  ---->  Workers Data from {workers_address}: \n **  {response3.text} ** \n" )

#      data = json.loads(counts)

    # (r"info/main/workers.html", Workers),
    # (r"info/worker/(.*).html", Worker),
    # (r"info/task/(.*).html", Task),
    # (r"info/main/logs.html", Logs),
    # (r"info/call-stacks/(.*).html", WorkerCallStacks),
    # (r"info/call-stack/(.*).html", TaskCallStack),
    # (r"info/logs/(.*).html", WorkerLogs),
    # (r"json/counts.json", CountsJSON),
    # (r"json/identity.json", IdentityJSON),
    # (r"json/index.html", IndexJSON),
    # (r"individual-plots.json", IndividualPlots),
    # (r"metrics", PrometheusHandler),
    # (r"health", HealthHandler),

  # "/system": systemmonitor_doc,
  # "/stealing": stealing_doc,
  # "/workers": workers_doc,
  # "/events": events_doc,
  # "/counters": counters_doc,
  # "/tasks": tasks_doc,
  # "/status": status_doc,
  # "/profile": profile_doc,
  # "/profile-server": profile_server_doc,
  # "/graph": graph_doc,
  # "/individual-task-stream": individual_task_stream_doc,
  # "/individual-progress": individual_progress_doc,
  # "/individual-graph": individual_graph_doc,
  # "/individual-profile": individual_profile_doc,
  # "/individual-profile-server": individual_profile_server_doc,
  # "/individual-nbytes": individual_nbytes_doc,
  # "/individual-nprocessing": individual_nprocessing_doc,
  # "/individual-workers": individual_workers_doc,

  def term(self):
      """Stop the metrics loop and close the Dask client."""
      self.active = False
      self.client.close()

  def runProcess( self, job: Job ) -> Optional[EDASDataset]:
    """Build and run the workflow for *job*.

    Returns the result of ``edasOpManager.buildTask``; on any exception the
    error is logged, the traceback printed, and None is returned.
    """
    start_time = time.time()
    try:
        self.logger.info( f"Running workflow for requestId: {job.requestId}, scheduler: {self.scheduler_address}" )
        result = edasOpManager.buildTask( job )
        self.logger.info( "Completed EDAS workflow in time " + str(time.time()-start_time) )
        return result
    except Exception as err:
        self.logger.error( "Execution error: " + str(err))
        traceback.print_exc()
        return None


  def submitProcess(self, service: str, job: Job, resultHandler: ExecHandler):
      """Start a SubmissionThread for *job* and remember it (*service* is unused)."""
      submitter: SubmissionThread = SubmissionThread( job, resultHandler )
      self.submitters.append( submitter )
      submitter.start()
Beispiel #11
0
    num_hipergator_workers = 120
    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           threads=1,
                           memory='4GB',
                           walltime='96:00:00',
                           death_timeout=600,
                           local_directory='/tmp/')

    print('Starting up workers')
    workers = cluster.start_workers(num_hipergator_workers)

    dask_client = Client(cluster)

    wait_time = 0
    while len(dask_client.scheduler_info()
              ['workers']) < num_hipergator_workers / 2:
        print('waiting on workers: {s} sec. so far'.format(s=wait_time))
        sleep(10)
        wait_time += 10

        # If 5 minutes goes by try adding them again
        if wait_time > 300:
            workers.extend(cluster.start_workers(1))

    print('Most workers accounted for')

    ##################################################
    # Main

    species_info = pd.read_csv(config['species_list_file'])
Beispiel #12
0
def main():
    """Run the mortgage ETL and distributed XGBoost training task graph.

    Starts a ``LocalCUDACluster`` with a Dask client, builds a two-task
    greenflow graph (mortgage workflow runner feeding the Dask XGBoost
    trainer), runs it, and prints the resulting booster.
    """
    # ---- Dask cluster ------------------------------------------------------
    memory_limit = 128e9
    threads_per_worker = 4
    cluster = LocalCUDACluster(memory_limit=memory_limit,
                               threads_per_worker=threads_per_worker)
    client = Client(cluster)
    sched_info = client.scheduler_info()

    print(f'CLIENT: {client}')
    scheduler_json = json.dumps(sched_info, indent=2)
    print(f'SCHEDULER INFO:\n{scheduler_json}')

    # Importing here in case RMM is used later on. The client must be started
    # before importing cudf-related modules when RMM is in use.
    from greenflow.dataframe_flow import (TaskSpecSchema, TaskGraph)

    # ---- Paths -------------------------------------------------------------
    here = os.path.dirname(__file__)
    # Alternative dataset location: '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(here, 'mortgage_data')
    mortgage_lib_module = os.path.join(here, 'mortgage_greenflow_plugins.py')

    # ---- Run configuration -------------------------------------------------
    greenflow_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # inclusive (2016 covers the full data set)

    # Number of data files to train against. 18 was achievable with
    # create_dmatrix_serially=True; 4 and 16 are other values that were tried.
    part_count = 18
    # When False on a single node without enough host RAM, dmatrix creation
    # becomes a race condition. Set to True unless host RAM is plentiful.
    create_dmatrix_serially = True
    # Use RAPIDS Memory Manager. Seems to work fine without it.
    use_rmm = False
    # Clean up intermediate dataframes in the xgboost training task.
    delete_dataframes = True
    filter_dask_logger = False

    mortgage_run_params_dict_list = generate_mortgage_greenflow_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        greenflow_task_spec_list)

    # ---- Task specifications -----------------------------------------------
    runner_task_name = \
        MortgageTaskNames.dask_mortgage_workflow_runner_task_name
    trainer_task_name = MortgageTaskNames.dask_xgb_trainer_task_name

    runner_conf = {
        'mortgage_run_params_dict_list': mortgage_run_params_dict_list,
        'client': client,
        'use_rmm': use_rmm,
        'filter_dask_logger': filter_dask_logger,
    }
    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id: runner_task_name,
        TaskSpecSchema.node_type: 'DaskMortgageWorkflowRunner',
        TaskSpecSchema.conf: runner_conf,
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module,
    }

    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        'distributed_dask': True,
        'loss': 'ls',
        # Previously tried objective: 'gpu:reg:linear'
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True,
    }

    trainer_conf = {
        'create_dmatrix_serially': create_dmatrix_serially,
        'delete_dataframes': delete_dataframes,
        'dxgb_gpu_params': dxgb_gpu_params,
        'client': client,
        'filter_dask_logger': filter_dask_logger,
    }
    dxgb_trainer_task = {
        TaskSpecSchema.task_id: trainer_task_name,
        TaskSpecSchema.node_type: 'DaskXgbMortgageTrainer',
        TaskSpecSchema.conf: trainer_conf,
        TaskSpecSchema.inputs: [runner_task_name],
        TaskSpecSchema.filepath: mortgage_lib_module,
    }

    # ---- Build and run the graph -------------------------------------------
    task_graph = TaskGraph([mortgage_workflow_runner_task, dxgb_trainer_task])
    [bst] = task_graph.run([trainer_task_name])

    print('XGBOOST BOOSTER:\n', bst)
def random_search_cv(est,
                     X,
                     y,
                     param_dist,
                     n_iter=100,
                     cv=3,
                     client=None,
                     log2file=False,
                     log2tbx=False):
    """Run an asynchronous randomized hyper-parameter search on a Dask cluster.

    One candidate per connected worker is submitted up front; each time a
    candidate finishes its scores are streamed to the live plot (and
    optionally logged) and the next sampled candidate is submitted, until
    ``n_iter`` candidates have been evaluated.

    Parameters
    ----------
    est : estimator passed through to ``evaluate``.
    X, y : training data passed through to ``evaluate``.
    param_dist : parameter distributions given to ``ParameterSampler``.
    n_iter : int
        Total number of parameter settings to sample.
    cv : cross-validation specification accepted by ``check_cv``.
    client : dask.distributed.Client or None
        Client to submit work to; a default ``Client()`` is created if None.
    log2file : bool
        Also log every score through the file logger.
    log2tbx : bool
        Also write every score to the summary writer.
    """
    # Local import keeps this block self-contained.
    from itertools import islice

    if client is None:
        client = Client()

    # Bind both names unconditionally so they are always defined.
    logger = writer = None
    if log2file or log2tbx:
        log_dir = make_logdir()
        if log2file:
            logger = make_filelogger(log_dir)
        if log2tbx:
            writer = make_summarywriter(log_dir)

    cv = check_cv(cv)
    source = source_for_param_space(param_dist)
    layout = cv_curve_and_table_for_source(source)
    h = bp.show(layout, notebook_handle=True)

    nworkers = len(client.scheduler_info()['workers'])
    param_iter = iter(ParameterSampler(param_dist, n_iter=n_iter))
    # islice stops quietly when fewer than `nworkers` candidates exist;
    # calling next() in a comprehension raised StopIteration when
    # n_iter < nworkers.
    initial_params = list(islice(param_iter, nworkers))
    initial_futures = [
        evaluate(est, X, y, params, cv, client) for params in initial_params
    ]
    params_map = dict(zip(initial_futures, initial_params))
    af = as_completed(initial_futures)

    for step, future in enumerate(af):
        mean_train_score, mean_test_score = future.result()
        # Best test score so far, including this result. Computed once and
        # reused for both the plot and the summary writer.
        cummax_score = max(
            np.append(source.data['cummax_score'], mean_test_score))
        source.stream({
            'index': [len(source.data['index'])],
            'mean_test_score': [mean_test_score],
            'cummax_score': [cummax_score],
            # pd.datetime was deprecated in pandas 1.0 and removed in 2.0;
            # pd.Timestamp is a datetime subclass with the same constructor.
            'tstamp': [pd.Timestamp.fromtimestamp(time())],
            'params': [str(params_map[future])],
        })
        push_notebook(h)

        if log2file:
            logger.info('scoring',
                        extra={
                            'mean_test_score': mean_test_score,
                            'mean_train_score': mean_train_score,
                            'tstamp': time(),
                            'params': params_map[future],
                        })

        if log2tbx:
            writer.add_scalars(
                'search-results/cv-curve', {
                    'mean_test_score': np.array(mean_test_score),
                    'cummax_score': np.array(cummax_score),
                }, step)

        # Keep the cluster busy: submit one new candidate per finished one
        # until the sampler is exhausted.
        params = next(param_iter, None)
        if params is not None:
            f = evaluate(est, X, y, params, cv, client)
            params_map[f] = params
            af.add(f)
Beispiel #14
0
class LargeELMRegressor(BasicELM, RegressorMixin):
    """ELM Regressor for larger-than-memory problems.

    Uses `Dask <https://dask.org>`_ for batch analysis of data in Parquet files.

    .. attention:: Why do I need Parquet files?

        Parquet files provide necessary information about the data without loading whole file content from
        disk. It makes a tremendous runtime difference compared to simpler `.csv` or `.json` file formats.
        Reading from files saves memory by loading data in small chunks, supporting arbitrary large input files.
        It also solves current memory leaks with Numpy matrix inputs in Dask.

        Any data format can be easily converted to Parquet, see `Analytical methods <techniques.html>`_ section.

        HDF5 is almost as good as Parquet, but performs worse with Dask due to internal data layout.

    .. todo: Write converters.

    .. todo: Memo about number of workers: one is good, several cover disk read latency but need more memory.
        On one machine matrix operators always run in parallel, do not benefit from Dask.

    .. todo: Memory consumption with large number of neurons - 100,000 neurons require 200GB or swap space, with
        read+write reaching 1GB/s. Suggested a fast SSD, or HDD + extra workers to hide swap latency.
        Mention that Dask is not the perfect solution, kept here for future updates. And it actually solves
        stuff larger than memory, albeit at a very high time+swap cost.

    .. todo: Avoid large batch sizes as workers can fail, safe bet is 2000-5000 range.

    .. todo: Fast HtH and in-place Cholesky solver.

    .. todo: Pro tip in documentation: run ELM with dummy 1000 data samples and 1e+9 regularization,
        This will test possible memory issues for workers without wasting your time on computing full HH.

    .. todo: Option to keep full HH permanently somewhere at disk. Saves before the final step,
        avoids failures from memory issues during Cholesky solver.

    .. todo: GPU + batch Cholesky solver, for both ELM and LargeELM.

    Requirements
    ------------
        * Pandas
        * pyarrow
        * python-snappy

    Parameters
    ----------

    batch_size : int
        Batch size used for both data samples and hidden neurons. With batch Cholesky solver, allows for very large
        numbers of hidden neurons of over 100,000; limited only by the computation time and disk swap space.

        .. hint:: Include bias and original features for best performance.

        ELM will include a bias term (1 extra feature), and the original features with `include_original_features=True`.
        For optimal performance, choose `batch_size` to be equal or evenly divide the
        `n_neurons + 1 (bias) + n_inputs (if include_original_features=True)`.

        .. todo:: Exact batch_size vs. GPU performance
    """
    def __del__(self):
        """Close the Dask client and cluster owned by this estimator, if any.

        Each resource is checked on its own so that a cluster created by
        ``_setup_dask_client`` is still shut down when the client was never
        attached (e.g. ``Client(...)`` raised after the cluster started).
        """
        # Client first, then cluster -- same order as before.
        for attr in ('client_', 'cluster_'):
            resource = getattr(self, attr, None)
            if resource is not None:
                resource.close()

    def _setup_dask_client(self):
        """Start a local Dask cluster, persist hidden-layer weights and report status.

        Creates ``self.cluster_`` and ``self.client_``, persists the dense
        projection matrices of all hidden layers as Dask arrays in
        ``self.W_``, sets ``OMP_NUM_THREADS=1`` on every worker, and prints
        the client and (when it can be determined) the dashboard URL.
        """
        # NOTE(review): the scratch directory below is a hard-coded,
        # user-specific path -- confirm it exists on the target machine.
        self.cluster_ = LocalCluster(
            n_workers=4,
            threads_per_worker=1,
            local_dir="/Users/akusok/wrkdir/dask-temp",
            memory_limit="8GB")
        self.client_ = Client(self.cluster_)

        W_list = [hl.projection_.components_ for hl in self.hidden_layers_]
        W_dask = [da.from_array(_dense(W), chunks=self.bsize_) for W in W_list]
        self.W_ = self.client_.persist(W_dask)

        def foo():
            # Executed on each worker via client.run.
            import os
            os.environ['OMP_NUM_THREADS'] = '1'

        self.client_.run(foo)

        print("Running on:", self.client_)

        # Best-effort dashboard URL: swap the scheduler address scheme for
        # "http" and its port for the dashboard port.
        try:
            info = self.client_.scheduler_info()
            dashboard = info['address'].split(":")
            dashboard[0] = "http"
            dashboard[-1] = str(info['services']['dashboard'])
            print("Dashboard at", ":".join(dashboard))
        except Exception:
            # Purely informational; never fail setup because of it. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit through.
            pass

    def _project(self, X_dask):
        """Build the lazy hidden-layer output matrix H for a Dask input array.

        Columns are ordered as: outputs of each hidden layer (in layer
        order), then the original features if ``include_original_features``
        is set, then a single column of ones. The result is rechunked to
        ``self.bsize_``.
        """
        def layer_output(hl, W):
            # Pairwise layers compute block-wise distances to the centroids.
            if hl.hidden_layer_ == HiddenLayerType.PAIRWISE:
                return X_dask.map_blocks(pairwise_distances,
                                         W,
                                         dtype=X_dask.dtype,
                                         chunks=(X_dask.chunks[0],
                                                 (W.shape[0], )),
                                         metric=hl.pairwise_metric)

            # Projection layers: linear map followed by the layer's ufunc.
            XW_dask = da.dot(X_dask, W.transpose())
            if hl.ufunc_ is dummy:
                return XW_dask
            if hl.ufunc_ is np.tanh:
                return da.tanh(XW_dask)
            return XW_dask.map_blocks(hl.ufunc_)

        columns = [
            layer_output(hl, W)
            for hl, W in zip(self.hidden_layers_, self.W_)
        ]
        if self.include_original_features:
            columns.append(X_dask)
        columns.append(da.ones((X_dask.shape[0], 1)))

        return da.concatenate(columns, axis=1).rechunk(self.bsize_)

    def _compute(self, X, y, sync_every, HH=None, HY=None):
        """Accumulate the matrices HH = H'H and HY = H'Y over all file pairs.

        Files are processed in order. Whenever the file index is a multiple
        of ``sync_every`` the running sums are persisted on the cluster
        (after waiting on them, for every file but the first accumulated
        one). With ``sync_every=None`` everything stays lazy.

        .. todo: actually distributed computations that scatter batches of data file names,
            and reduce-sum the HH,HY matrices.
        """
        for i, (X_file, y_file) in enumerate(zip(X, y)):
            X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
            Y_dask = dd.read_parquet(y_file).to_dask_array(lengths=True)
            H_dask = self._project(X_dask)

            HH_part = da.dot(H_dask.transpose(), H_dask)
            HY_part = da.dot(H_dask.transpose(), Y_dask)
            sync_now = sync_every is not None and i % sync_every == 0

            if HH is None:
                # First contribution: start the running sums.
                HH, HY = HH_part, HY_part
            else:
                HH += HH_part
                HY += HY_part
                if sync_now:
                    wait([HH, HY])

            # Synchronization point.
            if sync_now:
                HH, HY = self.client_.persist([HH, HY])

        # Make sure outstanding work is finished before returning.
        if sync_every is not None:
            wait([HH, HY])
        return HH, HY

    def _solve(self, HH, HY):
        """Solve the regularized system (HH + alpha*I) B = HY with Dask.

        When HH is larger than one batch and its size is not a multiple of
        ``self.bsize_``, HH and HY are zero-padded up to the next multiple
        before solving and the padding rows are removed from the result.
        """
        n_features, _ = HH.shape
        bsize = self.bsize_

        # Rows/columns spilling past the last full chunk (0 if none).
        overhang = n_features % bsize if n_features > bsize else 0
        padding = 0
        if overhang > 0:
            print("Adjusting batch size {} to n_features {}".format(
                self.bsize_, n_features))
            padding = bsize - overhang
            HH = da.block([
                [HH, da.zeros((n_features, padding))],
                [da.zeros((padding, n_features)), da.zeros((padding, padding))],
            ])
            HY = da.block([[HY], [da.zeros((padding, HY.shape[1]))]])

        # Rechunk, add the alpha-scaled identity, and solve.
        regularizer = self.alpha * da.eye(HH.shape[1], chunks=self.bsize_)
        HH = HH.rechunk(self.bsize_) + regularizer
        HY = HY.rechunk(self.bsize_)

        B = da.linalg.solve(HH, HY, sym_pos=True)
        return B[:n_features] if padding > 0 else B

    def fit(self, X, y=None, sync_every=10):
        """Fit the ELM on data stored in a collection of Parquet files.

        The feature set is taken from the first file; the same features must
        carry the same names across the whole dataset. Sparse data is **not**
        supported.

        Original features and the bias column are appended at the end of the
        hidden-layer outputs for easier rechunk-merge, so that full chunks of
        hidden neuron outputs stay intact.

        .. todo: Check what happens if features are in different order or missing.

        .. todo: Check if some sparse data would work.

        .. todo: Check that sync_every does not affect results

        .. todo: Add single precision

        .. todo: Parquet file format examples in documentation

        Parameters
        ----------

        X : [str]
            List of input data files in Parquet format.

        y : [str]
            List of target data files in Parquet format.

        sync_every : int or None
            Synchronize computations after this many files are processed;
            None runs without synchronization. Less frequent synchronization
            is faster with small data files but may need a lot of swap space
            on large problems. Use a smaller number if swap space becomes a
            problem.

        Returns
        -------
        self
        """
        if not (_is_list_of_strings(X) and _is_list_of_strings(y)):
            raise ValueError("Expected X and y as lists of file names.")

        if len(X) != len(y):
            raise ValueError(
                "Expected X and y as lists of files with the same length. "
                "Got len(X)={} and len(y)={}".format(len(X), len(y)))

        # The first file pair defines the problem dimensions.
        X_dask = dd.read_parquet(X[0]).to_dask_array(lengths=True)
        Y_dask = dd.read_parquet(y[0]).to_dask_array(lengths=True)

        n_samples, n_features = X_dask.shape
        if getattr(self, 'n_features_', n_features) != n_features:
            raise ValueError(
                'Shape of input is different from what was seen in `fit`')

        _, n_outputs = Y_dask.shape
        if getattr(self, 'n_outputs_', n_outputs) != n_outputs:
            raise ValueError(
                'Shape of outputs is different from what was seen in `fit`')

        # Batch size: explicit value wins; otherwise all-at-once below
        # 10_000 samples and 2000 from there on.
        if self.batch_size is not None:
            self.bsize_ = self.batch_size
        elif n_samples < 10 * 1000:
            self.bsize_ = n_samples
        else:
            self.bsize_ = 2000

        # First call: initialize the model and start the Dask client.
        already_initialized = hasattr(self, 'hidden_layers_')
        if not already_initialized:
            self.n_features_ = n_features
            self.n_outputs_ = n_outputs

            X_sample = X_dask[:10].compute()
            self._init_hidden_layers(X_sample)
            self._setup_dask_client()

        HH, HY = self._compute(X, y, sync_every=sync_every)
        self.B = self._solve(HH, HY)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Predict targets for a list of Parquet files or for a numeric array.

        Parameters
        ----------

        X : array-like, [str]
            Input data as list of Parquet files, or as a numeric array.

        Returns
        -------
        Yh : array, shape (n_samples, n_outputs)
            Predicted values for all input samples.

            .. attention:: Returns all outputs as a single in-memory array!

                Danger of running out of memory for high-dimensional outputs, if a large set of input
                files is provided. Feed data in smaller batches in such case.
        """
        check_is_fitted(self, 'is_fitted_')

        if _is_list_of_strings(X):
            # Lazily project every file, then evaluate everything at once.
            Yh_list = []
            for X_file in X:
                X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
                H_dask = self._project(X_dask)
                Yh_list.append(da.dot(H_dask, self.B))

            Yh_dask = da.concatenate(Yh_list, axis=0)
            return Yh_dask.compute()

        # In-memory branch. The columns of H must follow the same order that
        # `_project` uses when the weights B are fitted: hidden-layer outputs
        # first, then the original features (if enabled), then the bias
        # column. Building H as [bias, original, hidden] pairs features with
        # the wrong rows of B.
        X = check_array(X, accept_sparse=True)
        H = [hl.transform(X) for hl in self.hidden_layers_]
        if self.include_original_features:
            H.append(_dense(X))
        H.append(np.ones((X.shape[0], 1)))

        return np.hstack(H) @ self.B.compute()
Beispiel #15
0
# Command-line arguments: zone of the instance and the idle timeout (seconds).
zone = sys.argv[3]
expired = int(sys.argv[4])

# NOTE(review): unpickling executes arbitrary code -- 'post.pkl' must come
# from a trusted source.
with open('post.pkl', 'rb') as fopen:
    post_message = cloudpickle.load(fopen)

# Retry every 5 seconds until the scheduler accepts the connection.
while True:
    try:
        client = Client('dask:8786')
        break
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit and made the retry loop impossible to interrupt.
        time.sleep(5)

# `now` holds the last moment at which any worker was seen executing a task.
now = datetime.now()
while True:
    workers = client.scheduler_info()['workers']
    if any(v['metrics']['executing'] != 0 for v in workers.values()):
        now = datetime.now()

    # total_seconds() covers the whole idle period; timedelta.seconds wraps
    # around after one day, so timeouts beyond 24 h could never trigger.
    if (datetime.now() - now).total_seconds() > expired:
        slack_msg = """
            Gracefully deleted Dask cluster. 
            *Time shutdown*: {exec_date}
            *Dask cluster name*: {dask_name}
            """.format(exec_date=str(datetime.now()), dask_name=name)
        post_message(slack_msg)
        # Delete the compute instance identified by project/zone/name.
        compute = googleapiclient.discovery.build('compute', 'v1')
        compute.instances().delete(project=project, zone=zone,
                                   instance=name).execute()
        break
Beispiel #16
0
    def setup_logging(self):
        """Set the root logger's level to INFO."""
        # getLogger() without a name returns the root logger.
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)

    def convert_to_toz(self):
        """Load the forcing dataset for scenario ssp585 and convert vmro3 to toz.

        Scenario name and base path are hard-coded below.
        """
        scenario = "ssp585"
        baseurl = "../oceanography/cmip6/ozone-absorption/"
        # The dataset returned here is handed straight to the converter.
        ds = self.get_input4mpis_forcing(scenario, baseurl)
        self.convert_vmro3_to_toz(scenario, ds, baseurl)


def main():
    """Create a CMIP6_ozone instance, configure logging, and run the conversion."""
    ozone = CMIP6_ozone()
    ozone.setup_logging()
    ozone.convert_to_toz()


if __name__ == '__main__':
    # Silence all warnings. The stdlib module is used directly:
    # `np.warnings` was only an incidental alias of it and is no longer
    # available in NumPy >= 1.24.
    import warnings
    warnings.filterwarnings('ignore')

    # https://docs.dask.org/en/latest/diagnostics-distributed.html
    from dask.distributed import Client

    dask.config.set(scheduler='processes')

    # Start a local client and report where its dashboard is served.
    client = Client()
    status = client.scheduler_info()['services']
    print("Dask started with status at: http://localhost:{}/status".format(
        status["dashboard"]))
    print(client)
    main()
Beispiel #17
0
class Adaptor:
    """Client-side adaptor that exposes externally produced chunks as Dask arrays.

    Connects to an existing scheduler, waits until the expected number of
    workers has registered, and builds Dask arrays whose chunks are futures
    keyed ``("deisa-<name>", <chunk index>)``.
    """
    # Class-level defaults; `adr`, `client` and `workers` are replaced per
    # instance in __init__.
    adr = ""
    client = None
    workers = []
    queues = []

    def __init__(self, Sworker, scheduler_info):
        """Connect to the scheduler and publish the worker addresses.

        Parameters
        ----------
        Sworker : int
            Exact number of workers to wait for.
        scheduler_info : str
            Path to a JSON file containing the scheduler's ``"address"``.
        """
        with open(scheduler_info) as f:
            s = json.load(f)
        self.adr = s["address"]
        # msgpack does not serialize large messages, hence dask + pickle.
        self.client = Client(self.adr, serializers=['dask', 'pickle'])
        # NOTE(review): the memory keys below use "distributed.workers.*";
        # Dask's documented namespace is "distributed.worker.*" -- confirm
        # these settings take effect.
        dask.config.set({
            "distributed.deploy.lost-worker-timeout": 60,
            "distributed.workers.memory.spill": 0.97,
            "distributed.workers.memory.target": 0.95,
            "distributed.workers.memory.terminate": 0.99
        })
        # Poll until exactly `Sworker` workers are registered.
        self.workers = self._worker_addresses()
        while len(self.workers) != Sworker:
            self.workers = self._worker_addresses()
        Variable("workers").set(self.workers)

    def _worker_addresses(self):
        """Return the (host, port) pair of every worker known to the scheduler."""
        return [
            comm.get_address_host_port(address, strict=False)
            for address in self.client.scheduler_info()["workers"]
        ]

    def _nested_chunks(self, name, shape, chunksize, dtype):
        """Return the nested list of delayed chunk arrays for array ``name``.

        One chunk of shape ``chunksize`` is created per position of the chunk
        grid (``shape[i] // chunksize[i]`` chunks along dimension ``i``); the
        nesting follows ``array_sort``.
        """
        chunks_in_each_dim = [
            shape[i] // chunksize[i] for i in range(len(shape))
        ]
        items = []
        for m in itertools.product(*[range(i) for i in chunks_in_each_dim]):
            f = Future(key=("deisa-" + name, m), inform=True, deisa=True)
            d = da.from_delayed(dask.delayed(f), shape=chunksize, dtype=dtype)
            items.append([list(m), d])
        return self.array_sort(items)

    def create_array(self, name, shape, chunksize, dtype, timedim):
        """Return one Dask array assembled from all chunks of ``name``.

        ``timedim`` is accepted for interface compatibility but not used.
        """
        return da.block(self._nested_chunks(name, shape, chunksize, dtype))

    def create_array_list(self, name, shape, chunksize, dtype,
                          timedim):
        """Return a list of Dask arrays, one per index of the first dimension.

        ``timedim`` is accepted for interface compatibility but not used.
        """
        # The result list used to be appended to without ever being created,
        # which raised NameError on the first iteration.
        nested = self._nested_chunks(name, shape, chunksize, dtype)
        return [da.block(step) for step in nested]

    def array_sort(self, ListDs):
        """Turn ``[index_list, value]`` pairs into a nested list ordered by index.

        Each level of nesting corresponds to one index position; entries are
        sorted by that position's value. With empty index lists the value of
        the first pair is returned as is.
        """
        if len(ListDs[0][0]) == 0:
            return ListDs[0][1]
        else:
            dico = dict()
            for e in ListDs:
                dico.setdefault(e[0][0], []).append([e[0][1:], e[1]])
            return [self.array_sort(dico[k]) for k in sorted(dico.keys())]

    def get_data(self, as_list=False):
        """Build the Dask arrays described in the "Arrays" queue.

        Parameters
        ----------
        as_list : bool
            If True, each entry is a list of arrays (see
            ``create_array_list``); otherwise a single array.

        Returns
        -------
        dict
            Maps array name to the created array(s).
        """
        arrays = dict()
        self.arrays_desc = Queue("Arrays").get()
        # TODO: the as_list=True path is untested.
        build = self.create_array_list if as_list else self.create_array
        for name in self.arrays_desc:
            desc = self.arrays_desc[name]
            arrays[name] = build(name, desc["sizes"], desc["subsizes"],
                                 desc["dtype"], desc["timedim"])
        # Barrier after the creation of all the dask arrays
        e = Event("Done")
        e.set()
        return arrays
Beispiel #18
0
# Temporary directory used for zarr storage.
os.environ['TMPDIR'] = '/blue/adamginsburg/adamginsburg/tmp'

if __name__ == "__main__":
    # The Client import has to live inside the main block for dask to work.
    from dask.distributed import Client

    # Defaults used outside of SLURM: 8 workers with 1 GB each.
    memlim, ntasks = 1, 8
    if os.getenv('SLURM_MEM_PER_NODE'):
        # Per-node memory divided by 1024 (GB), shared evenly among tasks.
        memlim_total = int(os.getenv('SLURM_MEM_PER_NODE')) / 1024 # GB
        ntasks = int(os.getenv('SLURM_NTASKS'))
        memlim = memlim_total / ntasks
        print(f"Memory limit is {memlim} GB")

    client = Client(memory_limit=f'{memlim}GB', n_workers=ntasks)
    nworkers = len(client.scheduler_info()['workers'])
    print(f"Client scheduler info: {client.scheduler_info()['services']}")
    print(f"Number of workers: {nworkers}  (should be equal to ntasks={ntasks})")
    print(f"Client scheduler info: {client.scheduler_info()}")
    print(f"Client vers: {client.get_versions(check=True)}")

    # A progress bar is registered for every run that is not a batch job.
    if os.getenv('ENVIRONMENT') != 'BATCH':
        from dask.diagnostics import ProgressBar
        pbar = ProgressBar()
        pbar.register()

    # The TMPDIR override above must be the directory tempfile reports.
    assert tempfile.gettempdir() == '/blue/adamginsburg/adamginsburg/tmp'

    basepath = '/orange/adamginsburg/ALMA_IMF/2017.1.01355.L/imaging_results'
Beispiel #19
0
######################################################
# Dask cluster setup
######################################################
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
from dask import delayed
import dask

cluster = SLURMCluster(
    processes=1,
    queue='hpg2-compute',
    cores=1,
    memory='10GB',
    walltime='96:00:00',
    job_extra=['--qos ewhite-b'],
    death_timeout=600,
    local_directory='/tmp/',
    interface='ib0',
)

print('Starting up workers')
workers = cluster.start_workers(hindcast_config.num_hipergator_workers)
dask_client = Client(cluster)

# Poll every 10 seconds until all requested workers have registered.
wait_time = 0
while len(dask_client.scheduler_info()['workers']) < hindcast_config.num_hipergator_workers:
    print(f'waiting on workers: {wait_time} sec. so far')
    sleep(10)
    wait_time = wait_time + 10

    # After 5 minutes of waiting, request one more worker on each pass.
    if wait_time > 300:
        workers.extend(cluster.start_workers(1))

print('All workers accounted for')

# xarray must be imported after dask.array, and (per the original note)
# after the cluster/client have been set up.
import dask.array as da
import xarray as xr

Beispiel #20
0
# Attach a timestamped format to the file handler and register the handler.
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Wall-clock start time of the run, as text.
start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

if __name__ == "__main__":
    # Two single-threaded worker processes with 12 GB each
    # (a plain `Client()` is the default alternative).
    client = Client(
        processes=True,
        n_workers=2,
        threads_per_worker=1,
        memory_limit='12GB',
    )

    print(client.scheduler_info()['services'])
    services = client.scheduler_info()['services']
    ready_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logger.info(f"client ready at ... {services} ... at {ready_at}")

    # Short soil property name -> soil grid layer name.
    soilgrids = dict(
        sand='soil250_grid_sand_sd',
        silt='soil250_grid_silt_sd',
        clay='soil250_grid_clay_sd',
        rock='soil250_grid_coarsefrag_sd',
        bd='soil250_grid_bulkdens_sd',
        soc='soil250_grid_soc_sd',
        awc='soil250_grid_awc_sd',
        k_sat='soil250_grid_k_sat_sd',
    )
Beispiel #21
0
def launch_python_post():
	"""Run the WRF Python post-processing pipeline from start to finish.

	Steps: load settings and required environment variables, start a Dask
	scheduler in a child process, connect a client, submit the worker job to
	the queue and wait for the workers, run the calculations, run the
	plotting routines, then retire the workers and shut everything down.

	The scheduler child process is terminated on every exit path that occurs
	after it has been started; a still-running non-daemon child would
	otherwise keep the interpreter from exiting.

	Exits the interpreter through ``sys.exit`` on any setup or plotting
	failure.
	"""
	logger = PyPostTools.pyPostLogger()

	logger.write("Initializing WRF Python Post-Processing Program")
	#Step 1: Load program settings
	logger.write(" 1. Application Initalization")
	logger.write("  - Loading control file, python_post_control.txt")
	_pySet = PyPostSettings.PyPostSettings()
	logger.write("  - Success!")
	logger.write("  - Testing Environmental Variables")
	try:
		dask_nodes = os.environ["PYTHON_POST_NODES"]
		dask_threads = os.environ["PYTHON_POST_THREADS"]
		postDir = os.environ["PYTHON_POST_DIR"]
		targetDir = os.environ["PYTHON_POST_TARG_DIR"]
	except KeyError:
		logger.write("***FAIL*** KeyError encountered while trying to access important environmental variables, abort.")
		sys.exit("")
	logger.write("  - Success!")
	logger.write("  - Initializing Dask (" + str(dask_nodes) + " Nodes Requested), Collecting routines needed")
	_routines = Routines.Routines()
	logger.write("   - Async IO Loop initialized...")
	def f(scheduler_port):
		async def g(port):
			# Use the coroutine's own argument rather than the closure.
			s = Scheduler(port=port)
			await s
			await s.finished()
		asyncio.get_event_loop().run_until_complete(g(scheduler_port))
	# Starts the scheduler in its own process - needed as otherwise it will
	# occupy the program and make it do an infinite loop
	process = Process(target=f, args=(scheduler_port,))
	process.start()
	logger.write("   - Dask Scheduler initialized (Port " + str(scheduler_port) + ")...")
	try:
		dask_client = Client("tcp://" + socket.gethostname() + ":" + str(scheduler_port), timeout=30)
	except OSError:
		logger.write("  <-> Dask Client could not be created, timeout error.")
		process.terminate()
		sys.exit()
	logger.write("   - Dask Client initialized...")
	logger.write("   - Writing Dask Worker Job Files...")
	with PyPostTools.cd(targetDir):
		writeFile = PyPostTools.write_job_file(socket.gethostname(), scheduler_port, project_name="climate_severe", queue="debug-cache-quad", nodes=dask_nodes, wall_time=60, nProcs=1)
		if(writeFile == False):
			dask_client.close()
			logger.write("   - Failed to write job file, are you missing an important parameter?")
			# Stop the scheduler child as well before exiting.
			process.terminate()
			sys.exit("")
		else:
			logger.write("   - Dask Worker Job File Written, Submitting to Queue.")
			PyPostTools.popen("chmod +x dask-worker.job")
			PyPostTools.popen("qsub dask-worker.job")
	# Wait here for workers.
	logger.write("   -> Worker Job submitted to queue, waiting for workers...")
	while len(dask_client.scheduler_info()['workers']) < int(dask_nodes):
		time.sleep(2)
	logger.write("   -> Workers are now connected.")
	logger.write("  - Success!")
	logger.write(" 1. Done.")
	logger.write(" 2. Start Post-Processing Calculations")
	start_calculations(dask_client, _routines, dask_threads, process)
	logger.write(" 2. Done.")
	logger.write(" 3. Generating Figures")
	logger.write("  - Collecting files from target directory (" + targetDir + ").")
	# os.path.join gives the right pattern whether or not targetDir ends
	# with a path separator.
	fList3 = sorted(glob.glob(os.path.join(targetDir, "WRFPRS_F*")))
	logger.write("  - " + str(len(fList3)) + " files have been found.")
	logger.write(" -> Pushing run_plotting_routines() to dask.")
	fullDict = _pySet.get_full_dict()
	plotting_future = start_plotting(dask_client, fullDict, dask_threads, process)
	wait(plotting_future)
	result_plot = dask_client.gather(plotting_future)[0]
	if(result_plot != 0):
		logger.write("***FAIL*** An error occured in plotting method, check worker logs for more info.")
		logger.close()
		# Release the client and the scheduler child before exiting.
		dask_client.close()
		process.terminate()
		sys.exit("")
	logger.write(" 3. Done.")
	logger.write(" 4. Final Steps")

	logger.write(" 4. Done, Closing Dask Client.")
	# Retire all workers, then close the client object
	dask_client.retire_workers(workers=dask_client.scheduler_info()['workers'], close=True)
	dask_client.close()
	logger.write("All Steps Completed.")
	logger.write("***SUCCESS*** Program execution complete.")
	logger.close()
	del dask_client
	process.terminate()
# Example script: connect a client after dask_mpi initialization, log an SSH
# tunnel command for the dashboard, and run a small array computation.
from dask_mpi import initialize

# initialize() is deliberately called before the remaining imports and before
# the Client is created; keep this ordering.
initialize()

import socket

from distributed.scheduler import logger

import dask.array as da
from dask.distributed import Client

client = Client()  # Connect this local process to remote workers

# Host name as reported by the scheduler process, and the dashboard port.
host = client.run_on_scheduler(socket.gethostname)
port = client.scheduler_info()['services']['dashboard']
login_node_address = (
    'supercomputer.university.edu'  # Change this to the address/domain of your login node
)

# Log the port-forwarding command for reaching the dashboard via the login node.
logger.info(f'ssh -N -L {port}:{host}:{port} {login_node_address}')

# Marker messages: one through the scheduler logger, one on stdout.
logger.info('HELLO' * 10)
print('WORLD' * 10)

# Standard deviation along axis 0 of a chunked random array.
x = da.random.random((200, 10_000, 5_000), chunks=(20, 1_000, 1_000))
y = x.std(axis=0)
y = y.compute()
print(y)
    name='make_profiles',
    walltime='00:30:00',
    job_extra=['--constraint=HSW24', '--exclusive', '--nodes=1'],
    memory='120GB',
    interface='ib0')
# Request 196 workers from the cluster.
cluster.scale(196)
cluster

from dask.distributed import Client
client = Client(cluster)
client

import time

# Check once per second until at least two workers have registered.
nb_workers = len(client.scheduler_info()["workers"])
while nb_workers < 2:
    time.sleep(1)
    nb_workers = len(client.scheduler_info()["workers"])
print(nb_workers)

import sys, glob
import numpy as np
import xarray as xr
sys.path.insert(0, "/scratch/cnt0024/hmg2840/albert7a/DEV/git/xscale")

import xscale.spectral.fft as xfft
import xscale
import Wavenum_freq_spec_func as wfs
import time
from dask.distributed import Client
import dask.array as da
from time import sleep
"""
A perpetual dask scheduler to test dask-top
Will print out the scheduler address and run 
jobs every few seconds until Ctl+c
"""


def sqrt(x):
    """Return ``x`` raised to the power 0.5."""
    return pow(x, 0.5)


if __name__ == '__main__':
    # Small local cluster with the dashboard on port 8787.
    client = Client(
        n_workers=2,
        nthreads=1,
        memory_limit='512mb',
        dashboard_address=8787,
    )
    print(client.scheduler_info())
    sleep(3)

    # Generate load forever: one scatter + submit round every 3 seconds.
    while True:
        random_block = da.random.random((1000, 1000), chunks=(50, 50))
        x = client.scatter(random_block)
        pending = client.submit(sqrt, x)
        _ = pending.result().compute()
        sleep(3)