class MyDaskClient:
    """Small wrapper around a dask ``Client`` for looking up tasks by key."""

    def __init__(self, address=None):
        """Create the underlying ``Client``, passing ``address`` through."""
        self._client = Client(address)

    def _who_has(self, key):
        """Return ``{"key": ..., "worker": ...}`` for ``key``.

        The "worker" value is whatever ``Client.who_has()`` maps ``key`` to.
        Returns ``None`` when ``key`` is not in that mapping.
        """
        locations = self._client.who_has()
        if key not in locations:
            return None
        return {"key": key, "worker": locations[key]}

    def get_status(self, key):
        """Report whether ``key`` is running on a worker or already finished.

        Returns ``{"status": "running", "worker": ...}`` when ``key`` is
        listed under a worker in ``Client.processing()``; otherwise
        ``{"status": "done", "dask_status": ...}`` when it appears in the
        task stream; otherwise ``None``.
        """
        # A key listed under a worker in processing() counts as running.
        for worker, assigned in self._client.processing().items():
            if key in assigned:
                return {"status": "running", "worker": worker}

        # Otherwise walk the task stream back to front, so that the last
        # entry recorded for the key is the one reported.
        back_to_front = reversed(self._client.get_task_stream())
        entry = next((t for t in back_to_front if t["key"] == key), None)
        if entry is None:
            return None
        return {"status": "done", "dask_status": entry["status"]}
# Example #2
# 0
class DaskSensor(Plugin):
    """Plugin that polls a dask scheduler and exposes cluster statistics.

    ``update()`` refreshes ``self.currentValue`` from
    ``Client.scheduler_info()``. The other methods aggregate over the
    per-worker info captured by the most recent ``update()`` call.
    """

    # Divisor applied to the raw memory / IO counters before rounding
    # (presumably converting bytes to MiB -- confirm against the dask docs).
    _SCALE = 1024**2

    def __init__(self, dask_address, **kwargs):
        """Connect to the scheduler at ``dask_address``.

        Remaining keyword arguments are forwarded to ``Plugin.__init__``.
        """
        super(DaskSensor, self).__init__(**kwargs)
        self.client = Client(address=dask_address)
        # Start with an empty snapshot so the aggregation helpers can be
        # called before the first update() without an AttributeError.
        self.worker_info = {}

        self.currentValue = {
            'Memory': {
                'total_memory': 0,
                'used_memory': 0,
                # Present from the start so readers never hit a KeyError
                # before the first update().
                'used_memory_percent': 0
            },
            'CPU': {
                'cpu_usage': 0
            },
            'Cluster': {
                'n_workers': 0,
                'total_threads': 0
            },
            'Workers': []
        }

    def close(self):
        """Close the underlying dask client."""
        self.client.close()

    def update(self):
        """Fetch fresh worker info and recompute every ``currentValue`` entry."""
        self.worker_info = self.client.scheduler_info()['workers']

        memory = self.currentValue['Memory']
        memory['total_memory'] = round(
            self.available_memory() / self._SCALE, 2)
        memory['used_memory'] = round(self.used_memory() / self._SCALE, 2)
        # Stored as a used/total fraction. Guard the division: the total is
        # 0 when no workers are connected (or none reports a memory limit).
        if memory['total_memory']:
            memory['used_memory_percent'] = (
                memory['used_memory'] / memory['total_memory'])
        else:
            memory['used_memory_percent'] = 0

        self.currentValue['CPU']['cpu_usage'] = self.cpu_usage()

        cluster = self.currentValue['Cluster']
        cluster['n_workers'] = self.num_workers()
        # Thread count, not worker count (previously num_workers() was
        # stored here by mistake).
        cluster['total_threads'] = self.num_threads()

        self.currentValue['Workers'] = self.get_worker_stats()

    def num_workers(self):
        """Return the number of workers in the current snapshot."""
        return len(self.worker_info)

    def num_threads(self):
        """Return the sum of ``nthreads`` over all workers."""
        return sum(info['nthreads'] for info in self.worker_info.values())

    def available_memory(self):
        """Return the sum of ``memory_limit`` over all workers (unscaled)."""
        return sum(info['memory_limit'] for info in self.worker_info.values())

    def used_memory(self):
        """Return the sum of ``metrics['memory']`` over all workers (unscaled)."""
        return sum(
            info['metrics']['memory'] for info in self.worker_info.values())

    def get_worker_stats(self):
        """Return one stats dict per worker in the current snapshot.

        Several fields ('user', 'id', 'name', 'rawtime', 'time', 'command',
        'local_ports') are fixed placeholder values; the rest are read from
        the worker's scheduler info, with memory and IO counters divided by
        ``_SCALE`` and rounded to two decimals.
        """
        worker_stats = []
        for address, info in self.worker_info.items():
            metrics = info['metrics']
            worker_stats.append({
                'user': '******',
                'id': 'filler',
                'name': 'filler',
                'rawtime': 1,
                'time': 1,
                'command': '',
                'cpu': metrics['cpu'],
                'memory': round(metrics['memory'] / self._SCALE, 2),
                'local_ports': 'filler',
                'address': address,
                'nthreads': info['nthreads'],
                'memory_limit': round(info['memory_limit'] / self._SCALE, 2),
                'read': round(metrics['read_bytes'] / self._SCALE, 2),
                'write': round(metrics['write_bytes'] / self._SCALE, 2),
            })
        return worker_stats

    def cpu_usage(self):
        """Return the mean of ``metrics['cpu']`` over all workers, or 0 if none."""
        usages = [info['metrics']['cpu'] for info in self.worker_info.values()]
        if not usages:
            return 0
        return sum(usages) / len(usages)

    def task_status(self):
        """Count task-stream entries from the last 10 seconds by key prefix.

        Returns a dict mapping ``key_split(task['key'])`` to the number of
        task-stream entries that share that value.
        """
        tasks = self.client.get_task_stream(start=time() - 10)
        task_counts = dict()
        for task in tasks:
            prefix = key_split(task['key'])
            task_counts[prefix] = task_counts.get(prefix, 0) + 1
        return task_counts
# Example #3
# 0
    mus = uproot_methods.TLorentzVectorArray.from_ptetaphim(
        *t.arrays(["Muon_pt","Muon_eta","Muon_phi","Muon_mass"],**extra)
    )
    mus = mus[mus.counts==2]
    mll = (mus[:,0]+mus[:,1]).mass
    bins = np.logspace(np.log10(0.5),np.log10(1000),num=300)
    counts,_ = np.histogram(np.clip(mll,bins[0],bins[-1]),bins=bins)
    return counts


# Clear the array_cache attribute on the workers before the timed run
# (presumably so the run starts "cold", matching the dask_cold_* output
# file name below -- confirm).
workers = list(c.scheduler_info()["workers"].keys())
# One cache-clearing task is submitted per worker address; the address is only
# a dummy argument to the lambda.
# NOTE(review): `workers=workers` appears to restrict the tasks to that *set*
# of workers rather than pinning one task to each worker, so this may not
# reach every worker -- confirm against the Client.map documentation.
c.gather(c.map(lambda x: get_worker().array_cache.clear(),workers,workers=workers))

# start
# The return value is discarded on purpose.
# NOTE(review): presumably this first call makes the scheduler start recording
# the task stream before the timed section -- confirm against the
# Client.get_task_stream documentation.
c.get_task_stream()
# print(get_mll_hist(chunks[0]))
# Time the full map + gather round trip over all chunks.
t0 = time.time()
futures = c.map(get_mll_hist,chunks)
results = c.gather(futures)
t1 = time.time()
print(len(results),"results")
print(t1-t0)
# Fetch only the task-stream records that fall inside the timed window.
task_stream = c.get_task_stream(start=t0,stop=t1)
print("task_stream length",len(task_stream))
# Drop the "type" column and dump the remaining records to a JSON file whose
# name includes `trial`.
pd.DataFrame(task_stream).drop("type",axis=1).to_json("data/dask_cold_{}.json".format(trial))

# Look up which workers hold each future's result and keep the first worker
# listed for every future, in the same order as `futures`.
d = c.who_has(futures)
# chunk_workers = list(zip(chunks,[d[f.key] for f in futures]))
workers = [d[f.key][0] for f in futures]
print(workers)