def multiprocess(func, parameters, num_workers=None, context=None):
    """
    Run the function with the parameters in parallel using multiprocessing.

    ``context`` is one of ``{"fork", "spawn", "forkserver"}``. For dask<2.16.0,
    the default context is "fork"; for dask>=2.16.0, the default is "spawn".
    """
    bag = dask.bag.from_sequence(parameters)
    config = {'scheduler': 'processes'}
    if context is not None:
        config['multiprocessing.context'] = context
    elif hasattr(parameters[0], 'multiprocessing_context'):
        config['multiprocessing.context'] = \
            parameters[0].multiprocessing_context
    with dask.config.set(config):
        if num_workers:
            results = bag.map(func).compute(num_workers=num_workers)
        elif hasattr(parameters[0], 'num_workers'):
            results = bag.map(func).compute(
                num_workers=parameters[0].num_workers)
        else:
            # num_workers defaults to the number of logical processors
            results = bag.map(func).compute()
    return results
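A minimal usage sketch for the function above. The `square` workload and the parameter list are illustrative (not from the original), and the `__main__` guard is needed because the "spawn" context re-imports the entry module:

import dask
import dask.bag  # the function above expects dask and dask.bag to be importable


def square(x):
    # hypothetical workload, used only for illustration
    return x * x


if __name__ == "__main__":
    results = multiprocess(square, list(range(10)), num_workers=4, context="spawn")
    print(results)  # [0, 1, 4, 9, ..., 81]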
def map_wrapper(function_item, list_items, other_args=None):
    from dask.distributed import Client
    import dask.bag as db
    c = Client()
    NCORES = len(c.ncores().values()) - 2
    b0 = db.from_sequence(list_items, npartitions=NCORES)
    if other_args is not None:
        list_items = list(db.map(function_item, b0, other_args).compute())
    else:
        list_items = list(db.map(function_item, b0).compute())
    return list_items
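A short, hypothetical usage sketch for map_wrapper; `double`, `add`, and the inputs are stand-ins, and the wrapper itself starts a dask.distributed Client:

def double(x):
    # hypothetical per-item workload
    return 2 * x


def add(x, offset):
    return x + offset


if __name__ == "__main__":  # Client() starts a local cluster, so guard the entry point
    print(map_wrapper(double, [1, 2, 3, 4]))              # [2, 4, 6, 8]
    # The optional third argument is broadcast to every call as a constant.
    # Note the wrapper assumes the local cluster has more than two workers,
    # since it partitions by len(c.ncores()) - 2.
    print(map_wrapper(add, [1, 2, 3, 4], other_args=10))  # [11, 12, 13, 14]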
def multiprocess(func, parameters, num_workers=None):
    """Run the function with the parameters in parallel using multiprocessing."""
    bag = dask.bag.from_sequence(parameters)
    with dask.set_options(get=dask.multiprocessing.get):
        if num_workers:
            results = bag.map(func).compute(num_workers=num_workers)
        elif hasattr(parameters[0], 'num_workers'):
            results = bag.map(func).compute(num_workers=parameters[0].num_workers)
        else:
            # num_workers defaults to the number of logical processors
            results = bag.map(func).compute()
    return results
def create_index_dependend_grouped_residual(scheme, parameter, problem_bag,
                                            constraint_labels_and_matrices,
                                            residual_function):

    def penalty_function(problem, labels_and_matrices):
        clp, residual = residual_function(labels_and_matrices.matrix, problem.data)
        penalty = residual
        if callable(scheme.model.has_additional_penalty_function):
            if scheme.model.has_additional_penalty_function():
                additional_penalty = scheme.model.additional_penalty_function(
                    parameter, labels_and_matrices.clp_label, clp, problem.index)
                penalty = np.concatenate([penalty, additional_penalty])
        return clp, residual, penalty

    penalty_bag = \
        db.map(penalty_function, problem_bag, constraint_labels_and_matrices)
    reduced_clp_labels = constraint_labels_and_matrices.pluck(0)
    reduced_clps = penalty_bag.pluck(0)
    residuals = penalty_bag.pluck(1)
    penalty = dask.delayed(np.concatenate)(penalty_bag.pluck(2))

    return reduced_clp_labels, reduced_clps, residuals, penalty
def test_01a_compute_score(dtcpop):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    #dtcpop = grid_points()
    dtclist = list(map(dtc_to_rheo, dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test, b0).compute())
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation, b0).compute())
    return dtclist
def main():  # noqa: D103
    bgen_dir = f'{ukb}/array_imputed'
    output_dir = f'{bgen_dir}/output'
    assert os.path.exists(bgen_dir)
    assert os.path.exists(output_dir)

    # why not retry? nothing else I can do
    dask.config.set({'distributed.scheduler.allowed-failures': 99})

    # Maximum of 10 concurrent downloads per application
    # See here: https://biobank.ctsu.ox.ac.uk/showcase/refer.cgi?id=644
    client = dask.distributed.Client(
        n_workers=10,
        local_directory="/oasis/tscc/scratch/jmargoli"
    )

    jobs = []  # calculate number of download batches
    for chrom in range(1, 23):
        jobs.append((ukb, chrom, bgen_dir))

    print(f"Number of jobs queued: {len(jobs)}", flush=True)

    bag = dask.bag.from_sequence(jobs)
    downloads = bag.map(download_item)
    client.compute(downloads, retries=99).result()  # wait for the result so
def test_01a_compute_score(dtcpop):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    #dtcpop = grid_points()
    dtclist = list(map(dtc_to_rheo, dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test, b0).compute())
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation, b0).compute())
    return dtclist
def find_rheobase(self, dtc):
    assert os.path.isfile(dtc.model_path), "%s is not a file" % dtc.model_path
    # If this is not the first pass/first generation, then assume the rheobase
    # value found before mutation still holds until proven otherwise.
    # dtc = check_current(model.rheobase, dtc)
    # If it's not true, enter a search with ranges informed by memory.
    cnt = 0
    while dtc.boolean == False:
        #dtc.current_steps = list(filter(lambda cs: cs != 0.0, dtc.current_steps))
        dtc_clones = [copy.copy(dtc) for i in range(0, len(dtc.current_steps))]
        for i, s in enumerate(dtc.current_steps):
            dtc_clones[i].ampl = None
            dtc_clones[i].ampl = dtc.current_steps[i]
        b0 = db.from_sequence(dtc_clones, npartitions=8)
        #dtc_clone = list(map(check_current, dtc_clones))
        dtc_clone = list(db.map(check_current, b0).compute())
        for d in dtc_clone:
            dtc.lookup.update(d.lookup)
        #print(dtc.lookup)
        dtc = check_fix_range(dtc)
        cnt += 1
        print(cnt, 'cnt value')
        #print(type(dtc.current_steps))
        #print(dtc.current_steps, 'this stays small')
    return dtc
def create_index_independend_grouped_residual(scheme, parameter, problem_bag,
                                              constraint_labels_and_matrices,
                                              residual_function):
    matrix_labels = problem_bag.pluck(1)\
        .map(lambda group: "".join(problem.dataset for problem in group))

    def penalty_function(matrix_label, problem, labels_and_matrices):
        clp, residual = residual_function(
            labels_and_matrices[matrix_label].matrix, problem.data)
        penalty = residual
        if callable(scheme.model.has_additional_penalty_function):
            if scheme.model.has_additional_penalty_function():
                additional_penalty = scheme.model.additional_penalty_function(
                    parameter, labels_and_matrices[matrix_label].clp_label, clp,
                    problem.index)
                penalty = np.concatenate([penalty, additional_penalty])
        return clp, residual, penalty

    penalty_bag = \
        db.map(penalty_function, matrix_labels, problem_bag,
               constraint_labels_and_matrices)
    reduced_clp_label = {
        label: constraint_labels_and_matrices[label].clp_label
        for label in constraint_labels_and_matrices
    }
    reduced_clps = penalty_bag.pluck(0)
    residuals = penalty_bag.pluck(1)
    penalty = dask.delayed(np.concatenate)(penalty_bag.pluck(2))

    return reduced_clp_label, reduced_clps, residuals, penalty
def test_grid_dimensions(self):
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    from neuronunit.optimization.optimization_management import map_wrapper
    import dask.bag as db
    npoints = 2
    nparams = 3
    for i in range(1, 10):
        for j in range(1, 10):
            grid_points = exhaustive_search.create_grid(npoints=i, nparams=j)
            b0 = db.from_sequence(grid_points[0:2], npartitions=8)
            dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
            self.assertEqual(i * j, len(dtcpop))
            self.assertNotEqual(dtcpop, None)
            dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid,
                                         grid_points[0:2])
            self.assertNotEqual(dtcpop_compare, None)
            self.assertEqual(len(dtcpop_compare), len(dtcpop))
            for i, j in enumerate(dtcpop):
                for k, v in dtcpop_compare[i].attrs.items():
                    print(k, v, i, j)
                    self.assertEqual(j.attrs[k], v)
    return True
def test_grid_dimensions(self):
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    from neuronunit.optimization.optimization_management import map_wrapper
    import dask.bag as db
    npoints = 2
    nparams = 3
    for i in range(1, 10):
        for j in range(1, 10):
            grid_points = exhaustive_search.create_grid(npoints=i, nparams=j)
            b0 = db.from_sequence(grid_points[0:2], npartitions=8)
            dtcpop = list(
                db.map(exhaustive_search.update_dtc_grid, b0).compute())
            self.assertEqual(i * j, len(dtcpop))
            self.assertNotEqual(dtcpop, None)
            dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid,
                                         grid_points[0:2])
            self.assertNotEqual(dtcpop_compare, None)
            self.assertEqual(len(dtcpop_compare), len(dtcpop))
            for i, j in enumerate(dtcpop):
                for k, v in dtcpop_compare[i].attrs.items():
                    print(k, v, i, j)
                    self.assertEqual(j.attrs[k], v)
    return True
def map_wrapper(function_item, list_items):
    from dask.distributed import Client
    import dask.bag as db
    c = Client()
    NCORES = len(c.ncores().values())
    b0 = db.from_sequence(list_items, npartitions=NCORES)
    list_items = list(db.map(function_item, b0).compute())
    return list_items
def calculate_descriptors(self, molecules: List[Molecule]) -> List[Molecule]:
    molecules = bag.map(self.descriptor, bag.from_sequence(molecules)).compute()
    molecules = [
        molecule for molecule in molecules
        if all(1.0 > property > 0.0 for property in molecule.descriptor)
    ]
    return molecules
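For a self-contained illustration of the same map-then-filter pattern, here is a sketch with a hypothetical Molecule stand-in and descriptor function; none of these names come from the original beyond what the method signature implies:

from dataclasses import dataclass
from typing import Tuple

import dask.bag as bag  # the method above uses dask.bag under the name `bag`


@dataclass
class Molecule:
    """Minimal stand-in for the original Molecule type (illustrative only)."""
    smiles: str
    descriptor: Tuple[float, ...] = ()


def toy_descriptor(molecule: Molecule) -> Molecule:
    # Hypothetical descriptor in the (0, 1) range, keyed off the SMILES length.
    molecule.descriptor = (len(molecule.smiles) / 100.0, 0.5)
    return molecule


if __name__ == "__main__":
    molecules = [Molecule("CCO"), Molecule("c1ccccc1")]
    computed = bag.map(toy_descriptor, bag.from_sequence(molecules)).compute()
    kept = [m for m in computed if all(1.0 > p > 0.0 for p in m.descriptor)]
    print(kept)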
def cas(self):
    grid = db.from_sequence(self.files, npartitions=8)
    urlDats = list(db.map(convert_and_score, grid).compute())
    urlDats = list(filter(lambda url: len(list(url)) > 3, urlDats))
    urlDats = list(filter(lambda url: len(list(url.keys())) > 3, urlDats))
    urlDats = list(filter(lambda url: str('penalty') in url.keys(), urlDats))
    if type(self.urlDats) is not type(None):
        urlDats.extend(self.urlDats)
    return urlDats
def _get_rays_d(lengths, stepSize, start_positions, scaled_look_vecs, Nproc=2):
    import dask.bag as db
    L = db.from_sequence(lengths)
    S = db.from_sequence(start_positions)
    Sv = db.from_sequence(scaled_look_vecs)
    Ss = db.from_sequence([stepSize] * len(lengths))

    # setup for multiprocessing
    data = db.zip(L, S, Sv, Ss)

    positions_l = db.map(helper, data)
    return positions_l.compute()
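The `helper` function is not shown in the original; the sketch below assumes a hypothetical implementation that unpacks one zipped record, to illustrate how db.zip feeds tuples into db.map:

import numpy as np
import dask.bag as db


def helper(ray):
    # Hypothetical stand-in for the helper used above: unpack one zipped record
    # (length, start_position, scaled_look_vec, step_size) and march along the ray.
    length, start, look_vec, step = ray
    steps = np.arange(0.0, length, step)
    return start + np.outer(steps, look_vec)


if __name__ == "__main__":
    lengths = [10.0, 20.0]
    starts = [np.zeros(3), np.ones(3)]
    look_vecs = [np.array([0.0, 0.0, 1.0]), np.array([0.0, 1.0, 0.0])]
    step_size = 0.5

    data = db.zip(db.from_sequence(lengths),
                  db.from_sequence(starts),
                  db.from_sequence(look_vecs),
                  db.from_sequence([step_size] * len(lengths)))
    positions = db.map(helper, data).compute()
    print([p.shape for p in positions])  # [(20, 3), (40, 3)]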
def run_grid(npoints, nparams, provided_keys=None):
    # Not all models will produce scores, since models with rheobase < 0 are filtered out.
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import update_dtc_pop
    grid_points = create_grid(npoints=npoints, nparams=nparams,
                              provided_keys=provided_keys)
    import dask.bag as db
    b = db.from_sequence(grid_points)
    dtcpop = list(db.map(update_dtc_pop, b).compute())
    print(dtcpop)
    # The mapping of the rheobase search needs to be a serial mapping for now,
    # since embedded in its functionality is a
    # probably this can be bypassed in the future by using zeromq's Client
    # (by using ipyparallel's core module/code base more directly).
    dtcpop = list(map(dtc_to_rheo, dtcpop))
    print(dtcpop)
    filtered_dtcpop = list(filter(lambda dtc: dtc.rheobase['value'] > 0.0, dtcpop))
    b = db.from_sequence(filtered_dtcpop)
    dtcpop = list(db.map(nunit_evaluation, b).compute())
    dtcpop = list(filter(lambda dtc: type(dtc.scores['RheobaseTestP']) is not type(None),
                         dtcpop))
    return dtcpop
def test_01a_compute_score(dtcpop, tests):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    from itertools import repeat
    #dtcpop = grid_points()
    rheobase_test = tests[0][0][0]
    xargs = list(zip(dtcpop, repeat(rheobase_test), repeat('NEURON')))
    dtclist = list(map(dtc_to_rheo, xargs))
    #dtclist = list(map(dtc_to_rheo, dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test, b0).compute())
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation, b0).compute())
    return dtclist
def dask_map(self, f, x: list) -> list:
    """A mapping function for Dask. Used for multithreading.

    Args:
        f: Any function.
        x (list): A list of inputs to be sequentially passed to that function.

    Returns:
        list: A list of outputs from that function.
    """
    x = db.from_sequence(x, npartitions=self.npartitions)
    return db.map(f, x).compute()
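A minimal usage sketch, assuming an owning object that only needs an npartitions attribute; the ParallelRunner class here is illustrative, not from the original:

import dask.bag as db


class ParallelRunner:
    """Illustrative owner object; only the npartitions attribute is required."""

    def __init__(self, npartitions: int = 4):
        self.npartitions = npartitions

    def dask_map(self, f, x: list) -> list:
        # Same pattern as above: build a bag, map, and compute eagerly.
        x = db.from_sequence(x, npartitions=self.npartitions)
        return db.map(f, x).compute()


if __name__ == "__main__":
    runner = ParallelRunner(npartitions=4)
    print(runner.dask_map(lambda v: v ** 2, [1, 2, 3, 4]))  # [1, 4, 9, 16]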
def grid_points():
    npoints = 2
    nparams = 10
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    grid_points = exhaustive_search.create_grid(npoints=npoints, nparams=nparams)
    import dask.bag as db
    b0 = db.from_sequence(grid_points[0:2], npartitions=8)
    dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
    assert dtcpop is not None
    return dtcpop
def grid_points():
    npoints = 2
    nparams = 10
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    grid_points = exhaustive_search.create_grid(npoints=npoints, nparams=nparams)
    import dask.bag as db
    b0 = db.from_sequence(grid_points[0:2], npartitions=8)
    dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
    assert dtcpop is not None
    return dtcpop
def get_bmarks():
    xkcd_self_sufficient = str('http://splasho.com/upgoer5/library.php')
    high_standard = str(
        'https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvMjc3MjUvZWxpZmUtMjc3MjUtdjIucGRm/elife-27725-v2.pdf?_hash=WA%2Fey48HnQ4FpVd6bc0xCTZPXjE5ralhFP2TaMBMp1c%3D'
    )
    the_science_of_writing = str(
        'https://cseweb.ucsd.edu/~swanson/papers/science-of-writing.pdf')
    pmeg = str(
        'http://www.elsewhere.org/pomo/'
    )  # Note this is so obfuscated, even the English language classifier rejects it.
    links = [xkcd_self_sufficient, high_standard, the_science_of_writing, pmeg]

    royal = '../BenchmarkCorpus/royal.txt'
    klpd = '../BenchmarkCorpus/planning_document.txt'
    klpdf = open(klpd)
    strText = klpdf.read()
    urlDat = {'link': 'local_resource'}
    klpdfp = text_proc(strText, urlDat, WORD_LIM=100)

    grid = db.from_sequence(links, npartitions=8)
    urlDats = list(db.map(process, grid).compute())
    urlDats.append(klpdfp)
    print(urlDats)

    klpdr = open(royal)
    strText = klpdr.read()
    urlDat = {'link': 'local_resource_royal'}
    klpdfr = text_proc(strText, urlDat, WORD_LIM=100)
    print(klpdfr)

    grid = db.from_sequence(links, npartitions=8)
    urlDats = list(db.map(process, grid).compute())
    urlDats.append(klpdfp)

    with open('benchmarks.p', 'wb') as f:
        pickle.dump(urlDats, f)
    return urlDats
def pmap(function, inputs, multiple=False, predicate=None):
    # type: (Callable[[Any], Any], Iterable[Iterable[Any]], bool, Callable[[Any], Any]) -> Iterable[Any]
    """
    Do a parallel map of the given :code:`function` on the given :code:`inputs`
    and optionally filter its results with :code:`predicate`.

    This is a simple wrapper for `dask`_ and works like :func:`map` but in
    parallel.

    .. code-block:: python

        fun = lambda d: SHA1.new(d).hexdigest()
        inputs = [b'1234', b'5678', b'9101', b'1121']
        assert (pmap(fun, inputs) ==
                ['7110eda4d09e062aa5e4a390b0a572ac0d2c0220',
                 '2abd55e001c524cb2cf6300a89ca6366848a77d5',
                 'f5a6fe40024c28967a354e591bb9fa21b784bf00',
                 '784e9240155834852dff458a730cceb50229df32'])

        predicate = lambda d: d.endswith('0')
        assert (pmap(fun, inputs, predicate=predicate) ==
                ['7110eda4d09e062aa5e4a390b0a572ac0d2c0220',
                 'f5a6fe40024c28967a354e591bb9fa21b784bf00'])

    .. _`dask`: https://docs.dask.org/en/latest/

    :param function: An arbitrary function that's mapped to the :code:`inputs`.
    :param inputs: Inputs for :code:`function`. Pass several input sequences if
        your function takes multiple arguments.
    :param multiple: Specifies whether :code:`inputs` contains multiple inputs
        or not. If you want to e.g. pass two lists :code:`xs` and :code:`ys` to
        :code:`function = lambda x, y: x + y`, you can pass
        :code:`inputs = [xs, ys]` and :code:`multiple=True` to interpret
        :code:`inputs` as inputs for multiple arguments.
    :param predicate: An optional filter function to filter the results of the
        computation. If none is passed, all results will be returned.
    :returns: The results of applying :code:`function` to :code:`inputs`.

    .. CAUTION::
        :code:`predicate` needs to be passed as a **keyword argument**,
        otherwise it will be treated as an input parameter to :code:`function`!
    """
    if not multiple:
        inputs = [inputs]
    promises = parallel.map(function,
                            *[parallel.from_sequence(i) for i in inputs])
    if not predicate:
        return list(promises)
    else:
        return list(promises.filter(predicate))
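A short usage sketch of the multiple=True and predicate paths, mirroring the docstring examples above and assuming `parallel` is dask.bag imported under that alias (the snippet does not show its imports):

import dask.bag as parallel  # assumed alias used by pmap above

xs = [1, 2, 3]
ys = [10, 20, 30]

# multiple=True zips the two argument lists element-wise into calls of the function.
print(pmap(lambda x, y: x + y, [xs, ys], multiple=True))     # expected: [11, 22, 33]

# predicate must be a keyword argument; it filters the computed results.
print(pmap(lambda x: x * 2, xs, predicate=lambda r: r > 2))  # expected: [4, 6]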
def cas(self):
    # Do in parallel as it is 2018
    pgrid = db.from_sequence(self.files, npartitions=8)
    urlDats = list(db.map(self.convert_and_score, pgrid).compute())
    # Just kidding: often a serial debug run is needed, regardless of the parallel speed up.
    # urlDats = list(map(self.convert_and_score, self.files))
    urlDats = [url for url in urlDats if type(url) is not type(None)]
    # urlDats = list(filter(lambda url: type(url) != None, urlDats))
    urlDats = list(filter(lambda url: len(list(url)) > 3, urlDats))
    urlDats = list(filter(lambda url: len(list(url.keys())) > 3, urlDats))
    # urlDats = list(filter(lambda url: str('penalty') in url.keys(), urlDats))
    if type(self.urlDats) is not type(None):
        urlDats.extend(self.urlDats)
    return urlDats
def predict(atm, series, dev_length, val_length, HP, week, mon, horizon,
            cpoint=0.05):
    # Predict for the development dataset
    fitted_model, model1 = train(series[0:dev_length], series[0:dev_length],
                                 HP, week, mon, cpoint=cpoint)
    model1['pred_day'] = 'day 0'
    model1['tid'] = atm
    rmse1 = model1.groupby([model1.ds.dt.month,
                            'pred_day']).se.mean().agg(np.sqrt).reset_index()

    dbseries = db.from_sequence(
        [series[i:i + dev_length] for i in range(val_length)])
    dbforecast = db.from_sequence([
        series[i + dev_length:i + dev_length + horizon][['ds', 'y']]
        for i in range(val_length)
    ])
    dbmaster = db.map(train, dbseries, dbforecast, HP, week, mon, cpoint=cpoint)
    modelf = dbmaster.compute()

    model = pd.concat([modelf[i][1] for i in range(len(modelf))])
    model['tid'] = atm
    rmse = model.groupby([model.ds.dt.month,
                          'pred_day']).se.mean().agg(np.sqrt).reset_index()
    print(f'\nATM:{atm} RMSE:{rmse.se.mean()}')
    return modelf[val_length - 1][0], pd.concat([model1, model]), pd.concat([rmse1, rmse])
def test_map_wrapper(self):
    npoints = 2
    nparams = 3
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    from neuronunit.optimization.optimization_management import map_wrapper
    grid_points = exhaustive_search.create_grid(npoints=npoints, nparams=nparams)
    b0 = db.from_sequence(grid_points[0:2], npartitions=8)
    dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
    assert dtcpop is not None
    dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid, grid_points[0:2])
    for i, j in enumerate(dtcpop):
        for k, v in dtcpop_compare[i].attrs.items():
            print(k, v, i, j)
            self.assertEqual(j.attrs[k], v)
    return True
def update_dtc_pop(pop, td=None, backend=None):
    '''
    Inputs: a population of genes/alleles, the population size MU, and an
    optional argument of a rheobase value guess.
    Outputs: a population of genes/alleles and a population of individual
    object shells, i.e. a pickleable container for gene attributes.

    Rationale: not every gene value will result in a model for which rheobase
    is found, in which case that gene is discarded. However, to compensate for
    losses in gene population size, more gene samples must be tested for a
    successful return from a rheobase search. If the tests return successfully,
    these newly sampled individuals are appended to the population, and then
    their attributes are mapped onto corresponding virtual model objects.
    '''
    import copy
    import numpy as np
    import dask.bag as db
    from deap import base
    toolbox = base.Toolbox()
    pop = [toolbox.clone(i) for i in pop]

    def transform(ind):
        from neuronunit.optimization.data_transport_container import DataTC
        dtc = DataTC()
        import neuronunit
        LEMS_MODEL_PATH = str(neuronunit.__path__[0]) + str('/models/NeuroML2/LEMS_2007One.xml')
        if backend is not None:
            dtc.backend = backend
        else:
            dtc.backend = 'NEURON'
        dtc.attrs = {}
        for i, j in enumerate(ind):
            dtc.attrs[str(td[i])] = j
        dtc.evaluated = False
        return dtc

    if len(pop) > 1:
        b = db.from_sequence(pop, npartitions=8)
        dtcpop = list(db.map(transform, b).compute())
    else:
        # In this case pop is not really a population but an individual,
        # but parsimony of naming variables suggests not changing the
        # variable name to reflect this.
        dtcpop = list(transform(pop))
    return dtcpop
def find_rheobase(self, dtc):
    import dask.bag as db
    assert os.path.isfile(dtc.model_path), "%s is not a file" % dtc.model_path
    # If this is not the first pass/first generation, then assume the rheobase
    # value found before mutation still holds until proven otherwise.
    # dtc = check_current(model.rheobase, dtc)
    # If it's not true, enter a search with ranges informed by memory.
    cnt = 0
    while dtc.boolean == False:
        dtc_clones = [dtc for s in dtc.current_steps]
        b0 = db.from_sequence(dtc.current_steps, npartitions=8)
        b1 = db.from_sequence(dtc_clones, npartitions=8)
        dtcpop = list(db.map(check_current, b0, b1).compute())
        for dtc_clone in dtcpop:
            dtc.lookup.update(dtc_clone.lookup)
        dtc = check_fix_range(dtc)
        cnt += 1
    return dtc
def grid_points():
    npoints = 2
    nparams = 10
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    electro_path = 'pipe_tests.p'
    import pickle
    assert os.path.isfile(electro_path) == True
    with open(electro_path, 'rb') as f:
        electro_tests = pickle.load(f)
    from neuronunit.optimization import exhaustive_search
    grid_points = exhaustive_search.create_grid(npoints=npoints, nparams=nparams)
    import dask.bag as db
    b0 = db.from_sequence(grid_points[0:2], npartitions=8)
    dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
    assert dtcpop is not None
    return dtcpop
def main():  # noqa: D103
    tscc_vcf_dir = f'{ukb}/../../resources/datasets/ukbiobank/exome/fe_crams'
    vcf_dir = tscc_vcf_dir
    bulk_floc = f'{ukb}/exome/fe_cram.bulk'
    assert os.path.exists(vcf_dir)
    assert os.path.exists(bulk_floc)

    current_files = set(os.listdir(tscc_vcf_dir))

    # why not retry? nothing else I can do
    dask.config.set({'distributed.scheduler.allowed-failures': 99})

    # Maximum of 10 concurrent downloads per application
    # See here: https://biobank.ctsu.ox.ac.uk/showcase/refer.cgi?id=644
    client = dask.distributed.Client(
        n_workers=10, local_directory="/oasis/tscc/scratch/jmargoli")

    jobs = []  # calculate number of download batches
    with open(bulk_floc) as bulk_file:
        for line in bulk_file:
            sample_ID, field_ID = line.split()
            if field_ID == '23163_0_0':
                suffix = 'cram'
            elif field_ID == '23164_0_0':
                suffix = 'cram.crai'
            file_name = f"{sample_ID}_{field_ID}.{suffix}"
            if file_name in current_files:
                continue
            jobs.append((ukb, sample_ID, field_ID, vcf_dir))

    print(f"Number of jobs queued: {len(jobs)}", flush=True)

    bag = dask.bag.from_sequence(jobs)
    downloads = bag.map(download_item)
    client.compute(downloads, retries=99).result()  # wait for the result so
def kull(pop):
    dtcpop = list(update_dtc_pop(pop, td))
    dtcpop = list(map(dtc_to_rheo, dtcpop))
    dtcpop = list(filter(lambda dtc: dtc.rheobase['value'] > 0.0, dtcpop))
    while len(dtcpop) < len(pop):
        dtcpop.append(dtcpop[0])
    dtcpop = list(map(format_test, dtcpop))
    b = db.from_sequence(dtcpop, npartitions=8)
    dtcpop = list(db.map(nunit_evaluation, b, error_criterion).compute())
    dtcpop = list(
        filter(lambda dtc: not isinstance(dtc.scores['RheobaseTestP'], type(None)),
               dtcpop))
    dtcpop = list(
        filter(lambda dtc: not type(None) in (list(dtc.scores.values())), dtcpop))
    dtcpop = list(
        filter(lambda dtc: not any(numpy.isinf(x) for x in list(dtc.scores.values())),
               dtcpop))
    return dtcpop
def apply_for_each_run_dir(self, action, client, status=Status.ENCODED):
    """
    For each run in this Campaign's run list, apply the specified action
    (an object of type Action).

    Parameters
    ----------
    action : Action
        The action to be applied to each run directory.
        ``action.act_on_dir()`` will be called with the run directory path
        as its only argument.
    client : dask.distributed.Client
        A Dask client associated with the cluster you want to run your jobs on.

    Returns
    -------
    """
    run_dirs = []
    for run_id, run_data in self.campaign_db.runs(
            status=status, app_id=self._active_app['id']):
        run_dirs.append(run_data['run_dir'])

    bag = dask.bag.from_sequence(run_dirs)
    future = client.compute(bag.map(action.act_on_dir))
    future.result()
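A minimal sketch of how this might be called, assuming a hypothetical Action-like object whose only required interface is act_on_dir(), and a local dask.distributed cluster (the class name, run directories, and worker count are illustrative):

import dask
import dask.bag
from dask.distributed import Client


class PrintAction:
    """Hypothetical Action: the only interface the code above relies on is act_on_dir(run_dir)."""

    def act_on_dir(self, run_dir):
        print(f"acting on {run_dir}")
        return run_dir


if __name__ == "__main__":
    client = Client(n_workers=2)             # local cluster for illustration
    run_dirs = ["runs/run_1", "runs/run_2"]  # stand-ins for campaign_db run_dir values

    bag = dask.bag.from_sequence(run_dirs)
    future = client.compute(bag.map(PrintAction().act_on_dir))
    print(future.result())                   # ['runs/run_1', 'runs/run_2']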
def test_bag_map():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    assert db.map(myadd, b).compute() == list(map(myadd, x))
    assert db.map(myadd, a=b).compute() == list(map(myadd, x))
    assert db.map(myadd, b, b2).compute() == list(map(myadd, x, x2))
    assert db.map(myadd, b, 10).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, 10, b=b).compute() == [myadd(10, b=i) for i in x]

    sol = [myadd(i, b=j, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, b=b2, c=100).compute() == sol

    sol = [myadd(i, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, c=100).compute() == sol

    x_sum = sum(x)
    sol = [myadd(x_sum, b=i, c=100) for i in x2]
    assert db.map(myadd, b.sum(), b=b2, c=100).compute() == sol

    sol = [myadd(i, b=x_sum, c=100) for i in x2]
    assert db.map(myadd, b2, b.sum(), c=100).compute() == sol

    sol = [myadd(a=100, b=x_sum, c=i) for i in x2]
    assert db.map(myadd, a=100, b=b.sum(), c=b2).compute() == sol

    a = dask.delayed(10)
    assert db.map(myadd, b, a).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, b, b=a).compute() == [myadd(i, b=10) for i in x]

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, b, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, b.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, b, unequal, c=b2).compute()
    with pytest.raises(ValueError):
        db.map(myadd, b, b=unequal, c=b2).compute()
def build_index(use_bag=False):
    """
    An experiment is a collection of outputNNN directories. Each directory
    represents the output of a single job submission script. These directories
    are created by the *payu* tool.

    This function creates and/or updates an index cache of variable names
    found in all NetCDF4 files.

    We can also examine the .nc files directly to infer their contents.
    For each .nc file, get variables -> dimensions:

        .ncfile, varname, dimensions, chunksize
    """

    # Build an index of all NetCDF files found in the directories to search.
    ncfiles = []
    runs_available = []

    print('Finding runs on disk...', end='')
    for directoryToSearch in directoriesToSearch:
        #print('Searching {}'.format(directoryToSearch))

        # find all subdirectories
        results = subprocess.check_output(['find', directoryToSearch,
                                           '-maxdepth', '3', '-type', 'd',
                                           '-name', 'output???'])
        results = [s for s in results.decode('utf-8').split()]
        runs_available.extend(results)

    print('found {} run directories'.format(len(runs_available)))

    #ncfiles.extend(results)
    #
    # results = subprocess.check_output(['find', directoryToSearch, '-name', '*.nc'])
    #
    # print('Found {} .nc files'.format(len(ncfiles)))

    # We can persist this index by storing it in a sqlite database placed in a
    # centrally available location. The use of the `dataset` module hides the
    # details of working with SQL directly.
    # In this database is a single table listing all variables in NetCDF4
    # files seen previously.
    print('Using database {}'.format(database_url))
    print('Querying database...', end='')
    db = dataset.connect(database_url)

    # find the list of all run directories
    r = db.query('SELECT DISTINCT rootdir, configuration, experiment, run FROM ncfiles')
    runs_already_seen = [os.path.join(*row.values()) for row in r]
    print('runs already indexed: {}'.format(len(runs_already_seen)))

    runs_to_index = list(set(runs_available) - set(runs_already_seen))
    if len(runs_to_index) == 0:
        print("No new runs found.")
        return

    print('{} new run directories found including...'.format(len(runs_to_index)))
    for i in range(min(3, len(runs_to_index))):
        print(runs_to_index[i])
    if len(runs_to_index) > 3:
        print('...')

    print('Finding files on disk...')
    ncfiles = []
    for run in tqdm.tqdm_notebook(runs_to_index, leave=True):
        results = subprocess.check_output(['find', run, '-name', '*.nc'])
        results = [s for s in results.decode('utf-8').split()]
        ncfiles.extend(results)

    IPython.display.clear_output(wait=True)

    # NetCDF files found on disk not seen before:
    #files_to_add = set(ncfiles) - set(files_already_seen)
    files_to_add = ncfiles
    print('Files found but not yet indexed: {}'.format(len(files_to_add)))

    # For these new files, we can determine their configuration, experiment, and run.
    # Use NetCDF4 to get the list of all variables in each file.
    # output* directories:
    # match the parent and grandparent directory to configuration/experiment
    find_output = re.compile('(.*)/([^/]*)/([^/]*)/(output\d+)/.*\.nc')

    # determine the general pattern for ncfile names
    find_basename_pattern = re.compile(
        '(?P<root>[^\d]+)(?P<index>__\d+_\d+)?(?P<indexice>\.\d+\-\d+)?(?P<ext>\.nc)')

    def index_variables(ncfile):

        matched = find_output.match(ncfile)
        if matched is None:
            return []

        if not os.path.exists(ncfile):
            return []

        basename = os.path.basename(ncfile)
        m = find_basename_pattern.match(basename)
        if m is None:
            basename_pattern = basename
        else:
            basename_pattern = (m.group('root')
                                + ('__\d+_\d+' if m.group('index') else '')
                                + ('.\d+-\d+' if m.group('indexice') else '')
                                + m.group('ext'))

        try:
            with netCDF4.Dataset(ncfile) as ds:
                ncvars = [{'ncfile': ncfile,
                           'rootdir': matched.group(1),
                           'configuration': matched.group(2),
                           'experiment': matched.group(3),
                           'run': matched.group(4),
                           'basename': basename,
                           'basename_pattern': basename_pattern,
                           'variable': v.name,
                           'dimensions': str(v.dimensions),
                           'chunking': str(v.chunking()),
                           } for v in ds.variables.values()]
        except:
            print('Exception occurred while trying to read {}'.format(ncfile))
            ncvars = []

        return ncvars

    if len(files_to_add) == 0:
        print("No new .nc files found.")
        return True

    print('Indexing new .nc files...')

    if use_bag:
        with distributed.Client() as client:
            bag = dask.bag.from_sequence(files_to_add)
            bag = bag.map(index_variables).flatten()

            futures = client.compute(bag)
            progress(futures, notebook=False)

            ncvars = futures.result()
    else:
        ncvars = []
        for file_to_add in tqdm.tqdm_notebook(files_to_add, leave=False):
            ncvars.extend(index_variables(file_to_add))

    IPython.display.clear_output()
    print('')
    print('Found {} new variables'.format(len(ncvars)))

    print('Saving results in database...')
    db['ncfiles'].insert_many(ncvars)

    print('Indexing complete.')

    return True