def test_adaptive_local_cluster_multi_workers(): loop = IOLoop.current() cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False, diagnostics_port=None, loop=loop, start=False) cluster.scheduler.allowed_failures = 1000 alc = Adaptive(cluster.scheduler, cluster, interval=100) c = Client(cluster, start=False, loop=loop) yield c._start() futures = c.map(slowinc, range(100), delay=0.01) start = time() while not cluster.scheduler.worker_info: yield gen.sleep(0.01) assert time() < start + 15 yield c._gather(futures) del futures start = time() while cluster.workers: yield gen.sleep(0.01) assert time() < start + 5 assert not cluster.workers yield gen.sleep(0.2) assert not cluster.workers futures = c.map(slowinc, range(100), delay=0.01) yield c._gather(futures) yield c._shutdown() yield cluster._close()
def main(_):
    # Generate scheduler
    data = da.from_array(np.array(Image.open(r'dota2.jpg')), chunks=(600, 400, 3))
    client = Client(args.address)
    client.upload_file('calcov.py')
    temp3 = np.zeros((3, 3))
    temp3[0, :] = [0.062467, 0.125000, 0.062467]
    temp3[1, :] = [0.125000, 0.250131, 0.125000]
    temp3[2, :] = [0.062467, 0.125000, 0.062467]
    D = []
    B = []
    for i in range(args.queue):
        D.append(np.array(data + i * 10))
        B.append(temp3 + 0.05)
    future = client.map(calcov.calCov, B, D)
    result = [[np.array(r[0]), str(r[1]), str(r[2])] for r in client.gather(future)]
    shutil.rmtree(r'./data', ignore_errors=True)
    os.mkdir(r'./data')
    for i, (img, timestamp, worker) in enumerate(result):
        # str.strip('tcp://') removes characters, not the prefix; replace() drops it cleanly
        name = worker.replace('tcp://', '')
        new_im = Image.fromarray(img)
        new_im.save('./data/result_%s_%s_(%s).jpg' % (i, timestamp, name))
def main(): #define parallel mcmc wrapper def parallel_mcmc(_): return (mcmc(initial_parameters=epa_0, proposer=normal_prop, param2res=param2res, costfunction=costfunction, nsimu=5000)) #check jobs resources to initialize dask workers num_threads = int( environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1))) initialize(interface='ib0', nthreads=num_threads) client = Client() #run 10 chains [[c_form1, j_form1], [c_form2, j_form2], [c_form3, j_form3], [c_form4, j_form4], [c_form5, j_form5], [c_form6, j_form6], [c_form7, j_form7], [c_form8, j_form8], [c_form9, j_form9], [c_form10, j_form10]] = client.gather(client.map(parallel_mcmc, range(0, 10))) #print chain5 output as test formal_c_path = dataPath.joinpath('chain5_pmcmc_c.csv') formal_j_path = dataPath.joinpath('chain5_pmcmc_j.csv') pd.DataFrame(c_form5).to_csv(formal_c_path, sep=',') pd.DataFrame(j_form5).to_csv(formal_j_path, sep=',')
class LocalDaskDistributor(DistributorBaseClass): """ Distributor using a local dask cluster and inproc communication. """ def __init__(self, n_workers): """ Initiates a LocalDaskDistributor instance. Parameters ---------- n_workers : int How many workers should the local dask cluster have? """ super().__init__() import tempfile from distributed import Client, LocalCluster # attribute .local_dir_ is the path where the local dask workers store temporary files self.local_dir_ = tempfile.mkdtemp() cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_) self.client = Client(cluster) self.n_workers = n_workers def distribute(self, func, partitioned_chunks, kwargs): """ Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local machine Parameters ---------- func : Callable Function to send to each worker. partitioned_chunks : List List of data chunks, each chunk is processed by one woker kwargs : Dict Parameters for the map function Returns ------- List The result of the calculation as a list - each item should be the result of the application of func to a single element. """ if isinstance(partitioned_chunks, Iterable): # since dask 2.0.0 client map no longer accepts iterables partitioned_chunks = list(partitioned_chunks) result = self.client.gather( self.client.map(partial(func, **kwargs), partitioned_chunks)) return result def close(self): """ Closes the connection to the local Dask Scheduler """ self.client.close()
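# A minimal, hypothetical usage sketch for the LocalDaskDistributor above; the
# `double` helper and the chunk values are made up for illustration and are not
# part of the original code.
def double(chunk, factor=2):
    return [x * factor for x in chunk]

distributor = LocalDaskDistributor(n_workers=2)
results = distributor.distribute(double, [[1, 2], [3, 4]], {"factor": 2})
print(results)  # one entry per chunk, e.g. [[2, 4], [6, 8]]
distributor.close()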
def main(args=None): args = parse_args(args) client = Client(args.scheduler) keys = [ f'nyc-tlc/trip data/yellow_tripdata_2009-{m:0>2}.csv' for m in range(1, 13) ] results = client.map(fetch, keys) wait(results)
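# `fetch` is defined elsewhere in that project; a hypothetical stand-in that
# pulls each key from the public S3 bucket with s3fs might look like this
# (an assumption for illustration, not the original implementation).
import s3fs

def fetch(key):
    fs = s3fs.S3FileSystem(anon=True)
    with fs.open(key, "rb") as f:
        return f.read(1024)  # just pull the first KiB as a smoke test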
class ClusterDaskDistributor(DistributorBaseClass): """ Distributor using a dask cluster, meaning that the calculation is spread over a cluster """ def __init__(self, address): """ Sets up a distributor that connects to a Dask Scheduler to distribute the calculaton of the features :param address: the ip address and port number of the Dask Scheduler :type address: str """ from distributed import Client self.client = Client(address=address) def calculate_best_chunk_size(self, data_length): """ Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction) to find the optimal chunk_size. :param data_length: A length which defines how many calculations there need to be. :type data_length: int """ n_workers = len(self.client.scheduler_info()["workers"]) chunk_size, extra = divmod(data_length, n_workers * 5) if extra: chunk_size += 1 return chunk_size def distribute(self, func, partitioned_chunks, kwargs): """ Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster :param func: the function to send to each worker. :type func: callable :param partitioned_chunks: The list of data chunks - each element is again a list of chunks - and should be processed by one worker. :type partitioned_chunks: iterable :param kwargs: parameters for the map function :type kwargs: dict of string to parameter :return: The result of the calculation as a list - each item should be the result of the application of func to a single element. """ if isinstance(partitioned_chunks, Iterable): # since dask 2.0.0 client map no longer accepts iterables partitioned_chunks = list(partitioned_chunks) result = self.client.gather( self.client.map(partial(func, **kwargs), partitioned_chunks)) return [item for sublist in result for item in sublist] def close(self): """ Closes the connection to the Dask Scheduler """ self.client.close()
class ClusterDaskDistributor(DistributorBaseClass): """ Distributor using a dask cluster, meaning that the calculation is spread over a cluster """ def __init__(self, address): """ Sets up a distributor that connects to a Dask Scheduler to distribute the calculaton of the features :param address: the ip address and port number of the Dask Scheduler :type address: str """ from distributed import Client self.client = Client(address=address) def calculate_best_chunk_size(self, data_length): """ Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction) to find the optimal chunk_size. :param data_length: A length which defines how many calculations there need to be. :type data_length: int """ n_workers = len(self.client.scheduler_info()["workers"]) chunk_size, extra = divmod(data_length, n_workers * 5) if extra: chunk_size += 1 return chunk_size def distribute(self, func, partitioned_chunks, kwargs): """ Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster :param func: the function to send to each worker. :type func: callable :param partitioned_chunks: The list of data chunks - each element is again a list of chunks - and should be processed by one worker. :type partitioned_chunks: iterable :param kwargs: parameters for the map function :type kwargs: dict of string to parameter :return: The result of the calculation as a list - each item should be the result of the application of func to a single element. """ result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks)) return [item for sublist in result for item in sublist] def close(self): """ Closes the connection to the Dask Scheduler """ self.client.close()
def main():
    # get command line arguments controlling launch
    threads = 1
    workers = 8
    for x in sys.argv[1:]:
        if x.find("threads") > -1:
            z = x.split("=")
            threads = int(z[1])
        if x.find("workers") > -1:
            z = x.split("=")
            workers = int(z[1])

    # launch with either threads and/or workers specified (0 = default)
    if threads == 0 and workers != 0:
        print("launching %d workers, default threads" % (workers))
        cluster = LocalCluster(n_workers=workers)
    if threads != 0 and workers == 0:
        print("launching %d threads, default workers" % (threads))
        cluster = LocalCluster(threads_per_worker=threads)
    if threads != 0 and workers != 0:
        print("launching %d workers with %d threads" % (workers, threads))
        cluster = LocalCluster(n_workers=workers, threads_per_worker=threads)
    print(cluster)
    client = Client(cluster)
    print(client)

    # do serial
    # NOTE: it is possible to launch an asynchronous client
    # but here we just do serial synchronous. See:
    # https://distributed.dask.org/en/latest/asynchronous.html
    result = []
    print(" pid Start T")
    for i in range(0, 5):
        j = 2
        result.append(client.submit(test, i, j).result())
    print(result)
    print(Counter(result))

    # do parallel
    n = 15
    np.random.seed(1234)
    x = np.random.random(n) * 20
    # set to uniform nonzero to get uniform run times for each task
    x = np.ones(n) * 10
    print(x)
    print(" pid Start T")
    L = client.map(test, range(n), x)
    mylist = client.gather(L)
    pids = []
    for m in mylist:
        x = m.split()[0]
        pids.append(x)
        print(m)
    pids = sorted(set(pids))
    print(len(pids), pids)
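# The worker function `test` used above is not shown in this snippet. A sketch
# consistent with how its return value is parsed (first whitespace-separated
# token is the worker PID, followed by the start time) could look like this;
# it is an illustration, not the original implementation.
import os
import time
from datetime import datetime

def test(i, delay):
    start = datetime.now().strftime("%H:%M:%S.%f")
    time.sleep(delay)  # simulate `delay` seconds of work
    return "%d %s" % (os.getpid(), start)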
class LocalDaskDistributor(DistributorBaseClass): """ Distributor using a local dask cluster and inproc communication. """ def __init__(self, n_workers): """ Initiates a LocalDaskDistributor instance. :param n_workers: How many workers should the local dask cluster have? :type n_workers: int """ from distributed import LocalCluster, Client import tempfile # attribute .local_dir_ is the path where the local dask workers store temporary files self.local_dir_ = tempfile.mkdtemp() cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_) self.client = Client(cluster) self.n_workers = n_workers def distribute(self, func, partitioned_chunks, kwargs): """ Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local machine :param func: the function to send to each worker. :type func: callable :param partitioned_chunks: The list of data chunks - each element is again a list of chunks - and should be processed by one worker. :type partitioned_chunks: iterable :param kwargs: parameters for the map function :type kwargs: dict of string to parameter :return: The result of the calculation as a list - each item should be the result of the application of func to a single element. """ if isinstance(partitioned_chunks, Iterable): # since dask 2.0.0 client map no longer accepts iteratables partitioned_chunks = list(partitioned_chunks) result = self.client.gather( self.client.map(partial(func, **kwargs), partitioned_chunks)) return [item for sublist in result for item in sublist] def close(self): """ Closes the connection to the local Dask Scheduler """ self.client.close()
def test_adaptive_local_cluster_multi_workers(): loop = IOLoop.current() cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, processes=False, diagnostics_port=None, loop=loop, start=False) cluster.scheduler.allowed_failures = 1000 alc = Adaptive(cluster.scheduler, cluster, interval=100) c = Client(cluster, start=False, loop=loop) yield c._start() futures = c.map(slowinc, range(100), delay=0.01) start = time() while not cluster.scheduler.worker_info: yield gen.sleep(0.01) assert time() < start + 15 yield c._gather(futures) del futures start = time() while cluster.workers: yield gen.sleep(0.01) assert time() < start + 5 assert not cluster.workers assert not cluster.scheduler.workers yield gen.sleep(0.2) assert not cluster.workers assert not cluster.scheduler.workers futures = c.map(slowinc, range(100), delay=0.01) yield c._gather(futures) yield c._shutdown() yield cluster._close()
class LocalDaskDistributor(DistributorBaseClass): """ Distributor using a local dask cluster and inproc communication. """ def __init__(self, n_workers): """ Initiates a LocalDaskDistributor instance. :param n_workers: How many workers should the local dask cluster have? :type n_workers: int """ from distributed import LocalCluster, Client import tempfile # attribute .local_dir_ is the path where the local dask workers store temporary files self.local_dir_ = tempfile.mkdtemp() cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_) self.client = Client(cluster) self.n_workers = n_workers def distribute(self, func, partitioned_chunks, kwargs): """ Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local machine :param func: the function to send to each worker. :type func: callable :param partitioned_chunks: The list of data chunks - each element is again a list of chunks - and should be processed by one worker. :type partitioned_chunks: iterable :param kwargs: parameters for the map function :type kwargs: dict of string to parameter :return: The result of the calculation as a list - each item should be the result of the application of func to a single element. """ result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks)) return [item for sublist in result for item in sublist] def close(self): """ Closes the connection to the local Dask Scheduler """ self.client.close()
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
class Sender(sender.BatchSender): def __init__(self, *args, **kwargs): super(Sender, self).__init__(*args, **kwargs) self.client = None self.parser = None self.sections = kwargs.get('sections', 10) def send(self, event): self._queue.put(event.raw_data) if self._queue.qsize() >= self._flush_size: self.need_flush.set() def catch(self, agent): super(Sender, self).catch(agent) args = agent.client.args kwargs = agent.client.kwargs self.client = Client(*args, **kwargs) self.parser = agent.real_parser def push(self): ret = False if not self._buffers: self._buffers.append([]) for i in range(self._max_batch_size): if not self._queue.empty(): self._buffers[-1].append(self._queue.get()) if len(self._buffers[-1]) >= int( self._max_batch_size / self.sections): self._buffers.append([]) else: break if self._buffers: pmap = partial(Parser.map, parser=self.parser) buffers = self.client.map(pmap, self._buffers) buffers = self.client.submit(Parser.reduce, buffers).result() if hasattr(self._output, 'sendmany'): self._output.sendmany(buffers) else: for event in buffers: self._output.send(event) self._buffers = [] if self._queue.qsize() < self._flush_size: ret = True return ret
def test_use_with_dask(): try: import dask import dask.distributed from distributed import Client except ImportError: import warnings warnings.warn("Dask and/or Distributed are not installed") return with open(f"{CURRENT_DIR}/test-ogusa-remote.json") as f: remote_outputs = json.loads(f.read()) outputs = cs_storage.read(remote_outputs["outputs"]) c = Client() futures = c.map(cs_storage.screenshot, outputs["renderable"]) results = c.gather(futures) for result in results: assert isinstance(result, bytes)
def main(): from argparse import ArgumentParser parser = ArgumentParser() #parser.add_argument('min_num', type=int) #parser.add_argument('max_num', type=int) args = parser.parse_args() num_threads = int( environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1))) initialize(interface='ib0', nthreads=num_threads) client = Client() min_num = 10 max_num = 100 start_time = datetime.now() num_primes = sum( client.gather(client.map(slow_is_prime, range(min_num, max_num + 1)))) end_time = datetime.now() print(f'{num_primes} primes between {min_num} and {max_num} ' f'[{end_time - start_time}]')
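# `slow_is_prime` is defined elsewhere; a deliberately naive version consistent
# with the timing experiment above might be the following (illustrative only).
def slow_is_prime(n):
    if n < 2:
        return False
    return all(n % k != 0 for k in range(2, n))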
class DaskClient(Thread): def __init__(self, clientUrl, clientId, daqObjectGenerator, resultQ): Thread.__init__(self, name='DaskClient-%s' % clientId) self.client = Client(clientUrl) self.clientId = clientId self.daqObjectGenerator = daqObjectGenerator self.resultQ = resultQ self.idQ = Queue() self.remoteIdQ = self.client.scatter(self.idQ) self.generatorQ = self.client.map(self.daqObjectGenerator.generate, self.remoteIdQ) self.pvQ = self.client.gather(self.generatorQ) self.nGenerated = 0 self.event = Event() def putTask(self, objectId): #t0 = time.time() self.idQ.put(objectId) #t1 = time.time() #dt = t1-t0 #print('PUSH TASK: %s' % dt) #self.event.set() def getPv(self, timeout=None): #t0 = time.time() pv = self.pvQ.get(timeout=timeout) #t1 = time.time() #dt = t1-t0 #print('GET PV: %s' % dt) return pv def run(self): print('STARTING THREAD, CLIENT ID: %s' % self.clientId) while True: pv = self.pvQ.get(timeout=None) self.nGenerated += 1 #print('GOT PV , CLIENT ID %s: %s' % (self.clientId, pv['ArrayId'])) #print('CLIENT ID %s: N GENERATED=%s' % (self.clientId, self.nGenerated)) self.resultQ.put((pv, self.clientId))
def parallelStatsDaskSimple(urlSplits, ds, nEpochs, variable, mask, coordinates,
                            reader, outHdfsPath, averagingConfig, sparkConfig,
                            accumulators=['count', 'mean', 'M2', 'min', 'max']):
    '''Compute N-day climatology statistics in parallel using Dask distributed.'''
    if not sparkConfig.startswith('dask,'):
        print("dask: configuration must be of form 'dask,n'", file=sys.stderr)
        sys.exit(1)
    numPartitions = int(sparkConfig.split(',')[1])

    with Timer("Configure Dask distributed"):
        from distributed import Client, as_completed
        client = Client(DaskClientEndpoint)

    print('Starting parallel Stats using Dask . . .', file=sys.stderr)
    start = time.time()
    futures = client.map(
        lambda urls: parallelStatsPipeline(
            urls, ds, nEpochs, variable, mask, coordinates, reader,
            averagingConfig, outHdfsPath, accumulators),
        urlSplits)

    outputFiles = []
    for future in as_completed(futures):
        outputFile = future.result()
        outputFiles.append(outputFile)
        end = time.time()
        print("parallelStats: Completed %s in %0.3f seconds." % (outputFile, (end - start)),
              file=sys.stderr)
    return outputFiles
def test_distributed_handler_distributed(values, expected_values): cluster = LocalCluster(processes=False) with DistributedHandler(cluster.scheduler_address) as handler: futures = handler.client.map(lambda x: x + 1, values) handler_map_results = handler.gather(futures) with DistributedHandler(cluster.scheduler_address) as handler: handler_batched_results = handler.batched_map(lambda x: x + 1, values) client = Client(cluster) futures = client.map(lambda x: x + 1, values) distributed_results = client.gather(futures) handler_map_results = set(handler_map_results) handler_batched_results = set(handler_batched_results) distributed_results = set(distributed_results) assert (handler_map_results == handler_batched_results and handler_map_results == distributed_results) cluster.close()
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread over a cluster
    """

    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute the calculation of the features

        Parameters
        ----------
        address : str
            The ip address and port number of the Dask Scheduler
        """
        super().__init__()

        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time, meaning when you start the extraction)
        to find the optimal chunk_size.

        Parameters
        ----------
        data_length : int
            A length which defines how many calculations there need to be.
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a cluster

        Parameters
        ----------
        func : Callable
            Function to send to each worker.
        partitioned_chunks : List
            List of data chunks, each chunk is processed by one worker
        kwargs : Dict
            Parameters for the map function

        Returns
        -------
        List
            The result of the calculation as a list - each item should be the result of the application of func
            to a single element
        """
        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0 client map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)

        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return result

    def close(self):
        """
        Closes the connection to the Dask Scheduler
        """
        self.client.close()
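# A quick worked example of the chunking heuristic in calculate_best_chunk_size
# (numbers chosen purely for illustration): with 4 workers and 1000 calculations,
# divmod(1000, 4 * 5) == (50, 0), so the chunk size is 50; with 1001 calculations
# the remainder bumps it to 51.
data_length, n_workers = 1000, 4
chunk_size, extra = divmod(data_length, n_workers * 5)
if extra:
    chunk_size += 1
print(chunk_size)  # 50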
assert adadamp.__version__ == "0.1.4" return train.main(epochs=epochs, verbose=False, seed=seed, tuning=False, **kwargs) futures = [] seeds = np.arange(seed_start, seed_start + n_runs) dampers = ["adadamp", "padadamp", "geodamp", "adagrad", "geodamplr"] assert set(dampers) == set(params.keys()) for damper in dampers: kwargs = params[damper] futures.extend(client.map(submit, seeds, **kwargs)) for future in as_completed(futures): try: data, train_data = future.result() # data, train_data = future except: # KilledWorker: # This is likely a problem with my code rather than with the # Dask cluster. # # https://stackoverflow.com/questions/46691675/what-do-killedworker-exceptions-mean-in-dask print("-" * 20) for info in sys.exc_info(): print(info) else: df = pd.DataFrame(data)
    a.append(url)
    return a

def get_url(r):
    url = 'https://s3.amazonaws.com/cloudydap/bytestream/' + r['md5']
    return url

def compute(url):
    # print url
    response = urllib2.urlopen(url)
    buf = response.read()
    # print len(buf)
    dec = zlib.decompressobj(32 + zlib.MAX_WBITS)
    unzipped = dec.decompress(buf)
    # print len(unzipped)
    # Pick a specific point
    a = unzipped[1] + unzipped[13104] + unzipped[26208] + unzipped[39312]
    # print struct.unpack('<f', a)
    return struct.unpack('<f', a)

# a = search("PRECCU AND chunk_position:\[0,0,0\] AND filename:MERRA2_100*")
a = search("PRECCU AND chunk_position:\[0,91,288\] AND filename:MERRA2_100*")
# a = search("PRECCU AND chunk_position:\[0,0,0\] AND filename:*tavgM_2d_int_*")
# search("PRECCU AND chunk_position:\[0,91,288\] AND filename: MERRA2_400.tavgM_2d_int_Nx.201507.nc4")

c = Client('localhost:8786')
m = c.map(compute, a)
x = c.gather(m)
print(x)
arg_parser.add_argument('--scheduler_port', default='8786', help='scheduler port to use') arg_parser.add_argument('--n', type=int, default=100, help='number of terms in sum') arg_parser.add_argument('--verbose', action='store_true', help='give verbose output') options = arg_parser.parse_args() client = Client('{0}:{1}'.format(options.scheduler, options.scheduler_port)) if options.verbose: print('Client: {0}'.format(str(client)), flush=True) futures = client.map(square, range(options.n)) total = client.submit(sum, futures) expected_total = (options.n - 1) * options.n * (2 * options.n - 1) // 6 print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format( total.result(), expected_total)) futures = client.map(get_hostname, range(options.n)) process_locations = client.gather(futures) if options.verbose: print('task placement:') print('\t' + '\n\t'.join(process_locations)) count = dict() for process_location in process_locations: _, _, hostname = process_location.split() if hostname not in count: count[hostname] = 0 count[hostname] += 1
return self.y * x**2**2**2 def bogus_helper(_args): bog, x = _args return bog.square(x) def square(x): return x**2**2**2 def neg(x): return -x # submit many function calls: A = client.map(square, range(10)) # print(A) B = client.map(neg, A) # print(B) # submit individual function calls: total = client.submit(sum, B) print(total.result()) bg = Bogus(2) args = [[bg, x] for x in range(10)] C = client.map(bogus_helper, args) results = client.gather(C) print(results)
parallel=False) pysp2.io.write_dat(my_binary, out_path + base + '.particle.dat') # Get all of the unique dates all_sp2_files = glob(sp2b_path + '*.sp2b') sp2_date_list = [x.split(".")[3] for x in all_sp2_files] sp2_date_list = sorted(list(set(sp2_date_list))) #sp2_date_list = ['20200218'] #process_day('20200218') print(sp2_date_list) cluster = PBSCluster(processes=6, cores=36, walltime='5:00:00', memory='270GB', name='dask-worker', queue='arm_high_mem', project='arm', job_extra=['-W group_list=cades-arm'], interface='ib0', extra=['--no-dashboard']) cluster.scale(36 * 6) client = Client(cluster) print("Waiting for workers before starting processing...") client.wait_for_workers(9) print(client) results = client.map(process_day, sp2_date_list) wait(results) #del client
# Set up scheduler
s = Scheduler(loop=loop)
s.start()

# Set up Workers
w = Worker('comet-14-02.sdsc.edu', loop=loop)
w.start(0)

# Set up client
client = Client('comet-14-02.sdsc.edu:8786')

def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

#pprint.pprint(list(chunks(range(0, 255), 64)))
output = []
y = list(chunks(range(0, 255), 64))
#print y[0]
for ix in y:
    a = client.map(sum, ix)
    output.append(a)

total = client.submit(sum, output)
total.visualize()
# Futures expose result(), not compute()
print(total.result())
client.gather(total)
def preprocessing_script(): """ This script will process all the hybridization folders combined in a processing folder. The input parameters are passed using arparse Parameters: ----------- scheduler: string tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). default = False. If False the process will run on the local computer using nCPUs-1 path: string Path to the processing directory """ # Inputs of the function parser = argparse.ArgumentParser(description='Preprocessing script') parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003') parser.add_argument('-path', help='processing directory') args = parser.parse_args() # Directory to process processing_directory = args.path # Dask scheduler address scheduler_address = args.scheduler if scheduler_address: # Start dask client on server or cluster client=Client(scheduler_address) else: # Start dask client on local machine. It will use all the availabe # cores -1 # number of core to use ncores = multiprocessing.cpu_count()-1 cluster = LocalCluster(n_workers=ncores) client=Client(cluster) # Subdirectories of the processing_directory that need to be skipped for the # analysis blocked_directories = ['_logs'] # Starting logger utils.init_file_logger(processing_directory) logger = logging.getLogger() # Determine the operating system running the code os_windows, add_slash = utils.determine_os() # Check training slash in the processing directory processing_directory=utils.check_trailing_slash(processing_directory,os_windows) # Get a list of the hybridization to process processing_hyb_list = next(os.walk(processing_directory))[1] # Remove the blocked directories from the directories to process processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ] for processing_hyb in processing_hyb_list: # Determine the hyb number from the name hybridization_number = processing_hyb.split('_hyb')[-1] hybridization = 'Hybridization' + hybridization_number hyb_dir = processing_directory + processing_hyb + add_slash # Parse the Experimental metadata file (serial) experiment_infos,image_properties, hybridizations_infos, \ converted_positions, microscope_parameters =\ utils.experimental_metadata_parser(hyb_dir) # Parse the configuration file flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir) # ----------------- .nd2 FILE CONVERSION ------------------------------ # Create the temporary subdirectory tree (serial) tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\ hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash) # Get the list of the nd2 files to process inside the directory files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2') # Get the list of genes that are analyzed in the current hybridization gene_list = list(hybridizations_infos[hybridization].keys()) # Organize the file to process in a list which order match the gene_list for # parallel processing organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f ] organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f ] # Each .nd2 file will be processed in a worker part of a different node # Get the addresses of one process/node to use for conversion node_addresses = utils.identify_nodes(client) workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()] # Run the conversion 
futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list, tmp_gene_dirs,processing_hyb=processing_hyb, use_ram=flt_rawcnt_config['use_ram'], max_ram=flt_rawcnt_config['max_ram'], workers=workers_conversion) client.gather(futures_processes) # --------------------------------------------------------------------- # ----------------- FILTERING AND RAW COUNTING ------------------------ # Create directories # Create the directory where to save the filtered images suffix = 'filtered_png' filtered_png_img_dir_path, filtered_png_img_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos, processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) suffix = 'filtered_npy' filtered_img_dir_path, filtered_img_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos, processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) # Create the directory where to save the counting suffix = 'counting' counting_dir_path, counting_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb, suffix,add_slash,flt_rawcnt_config['skip_tags_counting'], flt_rawcnt_config['skip_genes_counting'], analysis_name=flt_rawcnt_config['analysis_name']) if flt_rawcnt_config['illumination_correction']: # Create the directory where to save the counting suffix = 'illumination_funcs' illumination_func_dir_path, illumination_func_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb, suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) # Loop through channels and calculate illumination for gene in hybridizations_infos[hybridization].keys(): flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy') logger.debug('Create average image for gene %s', gene) # Chunking the image list num_chunks = sum(list(client.ncores().values())) chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks) # Scatter the images sublists to process in parallel futures = client.scatter(chunked_list) # Create dask processing graph output = [] for future in futures: ImgMean = delayed(utils.partial_image_mean)(future) output.append(ImgMean) ImgMean_all = delayed(sum)(output) ImgMean_all = ImgMean_all/float(len(futures)) # Compute the graph ImgMean = ImgMean_all.compute() logger.debug('Create illumination function for gene %s',gene) # Create illumination function Illumination=filters.gaussian(ImgMean,sigma=(20,300,300)) # Normalization of the illumination Illumination_flat=np.amax(Illumination,axis=0) Illumination_norm=Illumination_flat/np.amax(Illumination_flat) logger.debug('Save illumination function for gene %s',gene) # Save the illumination function illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0] illumination_fname=illumination_path+gene+'_illumination_func.npy' np.save(illumination_fname,Illumination_norm,allow_pickle=False) # Broadcast the illumination function to all the cores client.scatter(Illumination_norm, broadcast=True) logger.debug('Filtering %s',gene) # Filtering and counting futures_processes=client.map(counting.filtering_and_counting_ill_correction,flist_img_to_filter, \ illumination_function=Illumination_norm,\ filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,\ filtered_img_gene_dirs =filtered_img_gene_dirs,\ counting_gene_dirs=counting_gene_dirs,plane_keep=flt_rawcnt_config['plane_keep'], \ 
min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],\ skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting']) client.gather(futures_processes) else: for gene in hybridizations_infos[hybridization].keys(): flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy') # filtering logger.debug('Filtering without illumination correction %s',gene) futures_processes=client.map(counting.filtering_and_counting,flist_img_to_filter, \ filtered_png_img_gene_dirs=filtered_png_img_gene_dirs, \ filtered_img_gene_dirs=filtered_img_gene_dirs, \ counting_gene_dirs=counting_gene_dirs, \ plane_keep=flt_rawcnt_config['plane_keep'], min_distance=flt_rawcnt_config['min_distance'],\ stringency=flt_rawcnt_config['stringency'],\ skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting']) client.gather(futures_processes) # --------------------------------------------------------------------- # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------ # # Combine the filter data in one single .ppf for each hybridization # # This step will run in serial mode and will not need to shuffle data # # between cores because everything is on the common file system # logger.debug('Create .ppf.hdf5 file') # # Create the ppf.hdf5 file that contains the filtered data in uint16 # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb, # hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties) # logger.debug('Write the .npy filtered files into the .ppf file') # # Load and write the .npy tmp images into the hdf5 file # # open the hdf5 file # with h5py.File(preprocessing_file_path) as f_hdl: # # Loop through each gene # for gene in hybridizations_infos[hybridization].keys(): # logger.debug('Writing %s images in .ppf.hdf5',gene) # # list of the files to transfer # filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0] # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy') # # loop through the list of file # for f_file in filtered_files_list: # pos = f_file.split('/')[-1].split('_')[-1].split('.')[0] # f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file) # f_hdl.flush() # # --------------------------------------------------------------------- # # ----------------- STITCHING ------------------------ # # Load the stitching parameters from the .yaml file # # Stitch the image in 2D or 3D (3D need more work/testing) # nr_dim = flt_rawcnt_config['nr_dim'] # # Estimated overlapping between images according to the Nikon software # est_overlap = image_properties['Overlapping_percentage'] # # Number of peaks to use for the alignment # nr_peaks = flt_rawcnt_config['nr_peaks'] # # Determine if the coords need to be flipped # y_flip = flt_rawcnt_config['y_flip'] # # Method to use for blending # # can be 'linear' or 'non linear' # # The methods that performs the best is the 'non linear' # blend = flt_rawcnt_config['blend'] # # Reference gene for stitching # reference_gene = flt_rawcnt_config['reference_gene'] # pixel_size = image_properties['PixelSize'] # # Get the list of the filtered files of the reference gene # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0] # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy') # # Create pointer of the hdf5 
file that will store the stitched reference image # # for the current hybridization # # Writing # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb # data_name = (tile_file_base_name # + '_' + reference_gene # + '_stitching_data') # stitching_file_name = tile_file_base_name + '.sf.hdf5' # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest') # replace with 'a' as soon as you fix the error # # Determine the tiles organization # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization, # est_overlap = est_overlap, y_flip = False, nr_dim = 2) # # Align the tiles # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples, # filtered_files_list=filtered_files_list,micData=micData, # nr_peaks=nr_peaks) # # Gather the futures # data = client.gather(futures_processes) # # In this case the order of the returned contingency tuples is with # # the order of the input contig_tuples # # P_all = [el for data_single in data for el in data_single[0]] # P_all =[data_single[0] for data_single in data ] # P_all = np.array(P_all) # P_all = P_all.flat[:] # covs_all = [data_single[1] for data_single in data] # alignment = {'P': P_all, # 'covs': covs_all} # # Calculates a shift in global coordinates for each tile (global # # alignment) and then applies these shifts to the corner coordinates # # of each tile and returns and saves these shifted corner coordinates. # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples, # micData, nr_pixels, z_count, # alignment, data_name, # nr_dim=nr_dim) # # Create the hdf5 file structure # stitched_group, linear_blending, blend = hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels, # reference_gene, blend = 'non linear') # # Fill the hdf5 containing the stitched image with empty data and # # create the blending mask # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64) # if blend is not None: # # make mask # stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64) # tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask']) # # Create the subdirectory used to save the blended tiles # suffix = 'blended_tiles' # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash, # analysis_name=flt_rawcnt_config['analysis_name']) # # Get the directory with the filtered npy images of the reference_gene to use for stitching # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0] # # Create the tmp directory where to save the masks # suffix = 'masks' # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash, # analysis_name=flt_rawcnt_config['analysis_name']) # # Create and save the mask files # for corn_value,corner_coords in joining['corner_list']: # if not(np.isnan(corner_coords[0])): # cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels), # int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)] # fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value) # np.save(fname,cur_mask) # # Blend all the tiles and save them in a directory # futures_processes = 
client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'], # stitching_files_dir = stitching_files_dir, # blended_tiles_directory = blended_tiles_directory, # masked_tiles_directory = masked_tiles_directory, # analysis_name = flt_rawcnt_config['analysis_name'], # processing_hyb = processing_hyb,reference_gene = reference_gene, # micData = micData,tiles = tiles,nr_pixels=nr_pixels, # linear_blending=linear_blending) # _ = client.gather(futures_processes) # # Write the stitched image # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels) # # close the hdf5 file # stitching_file.close() # # Delete the directories with blended tiles and masks # shutil.rmtree(blended_tiles_directory) # shutil.rmtree(masked_tiles_directory) # ----------------- DELETE FILES ------------------------ # Don't delete the *.npy files here because can be used to # create the final images using the apply stitching related function client.close()
from distributed import Client
import time

client = Client("192.168.0.106:8786")
client.restart()

from funcs import create_dirs, get_dirs, add_flag

future = client.map(create_dirs, range(100))
flags = client.submit(get_dirs, future)
# gather blocks until get_dirs has run and returns its result
result = client.gather(flags)
print(result)
def parallel_calculate_chunks(chunks, features, approximate, training_window, verbose, save_progress, entityset, n_jobs, no_unapproximated_aggs, cutoff_df_time_var, target_time, pass_columns, dask_kwargs=None): from distributed import Client, LocalCluster, as_completed from dask.base import tokenize client = None cluster = None try: if 'cluster' in dask_kwargs: cluster = dask_kwargs['cluster'] else: diagnostics_port = None if 'diagnostics_port' in dask_kwargs: diagnostics_port = dask_kwargs['diagnostics_port'] del dask_kwargs['diagnostics_port'] workers = n_jobs_to_workers(n_jobs) workers = min(workers, len(chunks)) cluster = LocalCluster(n_workers=workers, threads_per_worker=1, diagnostics_port=diagnostics_port, **dask_kwargs) # if cluster has bokeh port, notify user if unxepected port number if diagnostics_port is not None: if hasattr(cluster, 'scheduler') and cluster.scheduler: info = cluster.scheduler.identity() if 'bokeh' in info['services']: msg = "Dashboard started on port {}" print(msg.format(info['services']['bokeh'])) client = Client(cluster) # scatter the entityset # denote future with leading underscore start = time.time() es_token = "EntitySet-{}".format(tokenize(entityset)) if es_token in client.list_datasets(): print("Using EntitySet persisted on the cluster as dataset %s" % (es_token)) _es = client.get_dataset(es_token) else: _es = client.scatter([entityset])[0] client.publish_dataset(**{_es.key: _es}) # save features to a tempfile and scatter it pickled_feats = cloudpickle.dumps(features) _saved_features = client.scatter(pickled_feats) client.replicate([_es, _saved_features]) end = time.time() scatter_time = end - start scatter_string = "EntitySet scattered to workers in {:.3f} seconds" print(scatter_string.format(scatter_time)) # map chunks # TODO: consider handling task submission dask kwargs _chunks = client.map(calculate_chunk, chunks, features=_saved_features, entityset=_es, approximate=approximate, training_window=training_window, profile=False, verbose=False, save_progress=save_progress, no_unapproximated_aggs=no_unapproximated_aggs, cutoff_df_time_var=cutoff_df_time_var, target_time=target_time, pass_columns=pass_columns) feature_matrix = [] iterator = as_completed(_chunks).batches() if verbose: pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | " "Progress: {l_bar}{bar}| " "Calculated: {n}/{total} chunks") pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str) for batch in iterator: results = client.gather(batch) for result in results: feature_matrix.append(result) if verbose: pbar.update() if verbose: pbar.close() except Exception: raise finally: if 'cluster' not in dask_kwargs and cluster is not None: cluster.close() if client is not None: client.close() return feature_matrix
def do(param): dataset = pickle.load(open(f'{os.environ["HOME"]}/dataset.pkl', 'rb')) Xs, ys, Xst, yst = dataset criterion, n_estimators, max_features, max_depth = param model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_features=max_features, max_depth=max_depth) model.fit(Xs, ys) ysp = model.predict(Xst) acc = accuracy_score(yst, ysp) print(acc) return [acc, list(param)] params = [] for cri in ['gini', 'entropy']: for n_esti in range(5, 15): for max_features in range(10, 20): for max_depth in range(4, 20): params.append((cri, n_esti, max_features, max_depth)) L = client.map(do, params) ga = client.gather(L) import json json.dump(ga, open('ga.json', 'w'), indent=2) print(ga)
# dask client
from distributed import Client
from os.path import join
from math import ceil

import numpy as np

from thredds_configuration import file_list_url, data_request, data_folder, thredds_servers
from dask_configuration import dask_scheduler_url
from thredds_utils import list_thredds_folder, compute_url_to_thredds_server_map, compute_avg_func

array_list = []
file_list = list_thredds_folder(file_list_url)

# connect to dask
client = Client(dask_scheduler_url)

url_list = []
for f in file_list:
    url_list.append(data_request + "/" + data_folder + "/" + f
                    + "?time1[0],Temperature_surface[0][0:360][0:719]")

# allocate urls to thredds servers
server_url_mapping = compute_url_to_thredds_server_map(url_list, thredds_servers)

# launch the dask computation and collect results
avg_results_status = client.map(compute_avg_func, server_url_mapping)
avg_results = client.gather(avg_results_status)
final_avg = np.mean(avg_results)
print(final_avg)
###Aux channels###
##################
chunk = 16384
pad = 256

# Find the data
#cache1=find_raw_frames(ifo, st1, st1+dur)
#cache2=find_raw_frames(ifo, st2, st2+dur)

# Connect to Dask scheduler
client = Client(args.address)

for t1, t2 in chunk_segments(segs, chunk, pad):
    print('Getting chunk', t1, t2)
    # Set up the channel list
    params_list = [(chan, ifo, t1, t2) for chan in channels]  # Add in st1, st2, dur for psd comparison tool
    # Run jobs on the cluster and return results
    jobs = client.map(aux_feat_get, params_list)
    result = client.gather(jobs)
    # Write out the results
    # Will sort the results by how much difference in the PSD there is
    #result.sort(key=lambda x: x[1], reverse=True)
    with open('results_of_aux_%u-%u.dat' % (t1, (t2 - t1)), 'wb') as fout:
        pickle.dump(result, fout)
help='port of the dask scheduler') options = arg_parser.parse_args() client = Client(f'{options.host}:{options.port:d}') if options.implementation == 'python': from julia_python import julia_set elif options.implementation == 'cython': from julia_cython import julia_set client.register_worker_callbacks(init_pyx) elif options.implementation == 'cython_omp': from julia_cython_omp import julia_set client.register_worker_callbacks(init_omp_pyx) else: msg = '{0} version not implemented\n' sys.stderr.write(msg.format(options.implementation)) sys.exit(1) domain = init_julia((options.re_min, options.re_max), (options.im_min, options.im_max), (options.n_re, options.n_im)) domains = np.array_split(domain, options.partitions) iterations = np.array_split( np.zeros(options.n_re * options.n_im, dtype=np.int32), options.partitions) start_time = time.time() futures = client.map(julia_set, domains, iterations) results = client.gather(futures) end_time = time.time() print('compute time = {0:.6f} s'.format(end_time - start_time)) np.savetxt('julia.txt', np.concatenate(results).reshape(options.n_re, options.n_im))
async def assert_basic_futures(c: Client) -> None: futures = c.map(inc, range(10)) results = await c.gather(futures) assert results == list(map(inc, range(10)))
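# A minimal sketch of how the async helper above might be driven, assuming an
# asynchronous local Client; `assert_basic_futures` and `inc` come from the
# test module above.
import asyncio
from distributed import Client

async def run_check():
    async with Client(asynchronous=True, processes=False) as c:
        await assert_basic_futures(c)

asyncio.run(run_check())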
if __name__ == '__main__': arg_parser = ArgumentParser(description='compute sum of squares and check ' 'task placement') arg_parser.add_argument('--scheduler', help='scheduler host') arg_parser.add_argument('--scheduler_port', default='8786', help='scheduler port to use') arg_parser.add_argument('--n', type=int, default=100, help='number of terms in sum') arg_parser.add_argument('--verbose', action='store_true', help='give verbose output') options = arg_parser.parse_args() client = Client('{0}:{1}'.format(options.scheduler, options.scheduler_port)) if options.verbose: print('Client: {0}'.format(str(client)), flush=True) futures = client.map(square, range(options.n)) total = client.submit(sum, futures) expected_total = (options.n - 1)*options.n*(2*options.n - 1)//6 print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format(total.result(), expected_total)) futures = client.map(get_hostname, range(options.n)) process_locations = client.gather(futures) if options.verbose: print('task placement:') print('\t' + '\n\t'.join(process_locations)) count = dict() for process_location in process_locations: _, _, hostname = process_location.split() if hostname not in count: count[hostname] = 0 count[hostname] += 1
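# Sanity check of the closed form used for expected_total above (illustrative):
# sum_{i=0}^{n-1} i^2 == (n - 1) * n * (2n - 1) / 6
n = 100
assert sum(i * i for i in range(n)) == (n - 1) * n * (2 * n - 1) // 6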