def handle(self, *args, **options):
    # Unpack variables
    name = options['name']
    model = options['model']
    segmentation = options['segmentation']
    spatial_aggregation = options['spatial_aggregation']
    categorical_variables = options['categorical_variables']
    scheduler_file = options['scheduler']

    # datacube query
    gwf_kwargs = {k: options[k] for k in ['product', 'lat', 'long', 'region']}
    iterable = gwf_query(**gwf_kwargs)

    # Start cluster and run
    client = Client(scheduler_file=scheduler_file)
    client.restart()
    C = client.map(predict_object, iterable, pure=False,
                   **{'model_name': model,
                      'segmentation_name': segmentation,
                      'categorical_variables': categorical_variables,
                      'aggregation': spatial_aggregation,
                      'name': name})
    result = client.gather(C)
    print('Successfully ran prediction on %d tiles' % sum(result))
    print('%d tiles failed' % result.count(False))
def setup_cluster(config: cpb.ConstructConfig, faiss_index_path: Path) -> None:
    # Connect
    if config.cluster.run_locally:
        print("Running on local machine!")
        dask_client = None
    else:
        cluster_address = f"{config.cluster.address}:{config.cluster.port}"
        print("Configuring Dask, attaching to cluster")
        print(f"\t- {cluster_address}")
        dask_client = Client(address=cluster_address)
        if config.cluster.restart:
            print("\t- Restarting cluster...")
            dask_client.restart()
        print(f"\t- Running on {len(dask_client.nthreads())} machines.")

    # Initialize helper objects on each worker
    preloader = dpg.WorkerPreloader()
    preloader.register(*text_util.get_scispacy_initalizer(
        scispacy_version=config.parser.scispacy_version,
    ))
    preloader.register(*text_util.get_stopwordlist_initializer(
        stopword_path=config.parser.stopword_list))
    preloader.register(*embedding_util.get_pytorch_device_initalizer(
        disable_gpu=config.sys.disable_gpu,
    ))
    preloader.register(*embedding_util.get_bert_initializer(
        bert_model=config.parser.bert_model,
    ))
    # This actual file path will need to be created during the pipeline before use
    preloader.register(*knn_util.get_faiss_index_initializer(
        faiss_index_path=faiss_index_path,
    ))
    # If SemRep is installed and configured with Agatha
    if (config.semrep.HasField("semrep_install_dir")
            and config.semrep.HasField("metamap_install_dir")):
        preloader.register(*semrep_util.get_metamap_server_initializer(
            metamap_install_dir=config.semrep.metamap_install_dir))
    dpg.add_global_preloader(client=dask_client, preloader=preloader)
def handle(self, *args, **options):
    # Unpack variables
    model_id = options['model_id']
    out_dir = options['out_dir']

    # Create output dir if it does not exist
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # datacube query
    gwf_kwargs = {k: options[k] for k in ['product', 'lat', 'long', 'region']}
    iterable = gwf_query(**gwf_kwargs)

    # Start cluster and run
    client = Client()
    client.restart()
    C = client.map(predict_pixel_tile, iterable,
                   **{'model_id': model_id, 'outdir': out_dir})
    filename_list = client.gather(C)
    print(filename_list)
def connect_to_dask_cluster(config: cpb.AbstractGeneratorConfig) -> None:
    # Potential cluster
    if config.cluster.run_locally or config.cluster.address == "localhost":
        print("Running dask on local machine!")
    else:
        cluster_address = f"{config.cluster.address}:{config.cluster.port}"
        print("Configuring Dask, attaching to cluster")
        print(f"\t- {cluster_address}")
        dask_client = Client(address=cluster_address)
        if config.cluster.restart:
            print("\t- Restarting cluster...")
            dask_client.restart()
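# Note: connect_to_dask_cluster() above never returns the client; it relies on the
# fact that instantiating distributed.Client registers it as the default client for
# later dask work. Below is a minimal sketch, not part of the original module, of a
# variant that also hands the client back to the caller (function and argument names
# are hypothetical).
from typing import Optional
from dask.distributed import Client

def connect_and_return_client(address: str, port: int, restart: bool = False) -> Optional[Client]:
    """Attach to a running scheduler and optionally restart its workers."""
    if address == "localhost":
        print("Running dask on local machine!")
        return None
    client = Client(address=f"{address}:{port}")
    if restart:
        # Clears worker memory; the scheduler and worker processes keep running.
        client.restart()
    return client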
def connnect_glue():
    # os.system('dask-ssh 128.104.222.{103,104,105,107}')
    # subprocess.call('dask-ssh', '128.104.222.{103,104,105,107}')
    # time.sleep(10)
    import numpy as np
    client = Client('128.104.222.103:8786')
    client.restart()
    x = da.from_zarr('/mnt/cephfs/smltar_numpyarr/zarr_data_full')
    print(x)
    # y = x[0:1]
    # z = x[100:101]
    # m = x[1000:1001]
    # n = x[1500:1501]
    # p = x[1400:1401]
    y = x[0:30]
    z = x[100:130]
    m = x[1000:1030]
    n = x[1500:1530]
    p = x[1400:1430]
    # zc = x[108:208]
    # mc = x[1008:1108]
    # nc = x[1508:1608]
    # pc = x[1601:1701]
    sum = (y + z - m + p) * n
    # sum2 = (zc + mc + nc + pc) * sum
    # sum3 = sum2 + (zc + mc + nc + pc) * sum
    # print(sum2)
    # sum.visualize('sum3')
    # frm = sum[15]
    fu = client.compute(sum)
    # p = r.result()
    # print(type(p))
    # return p
    re = fu.result()
    re = np.array(re)
    np.save("/mnt/cephfs/result/test", re[15])
    # print(p)
    return re[15]
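# A minimal sketch (not part of the original code) of the same slice-compute-save
# pattern with a context-managed client, so the connection is closed even if the
# computation fails. The scheduler address and zarr store path are the ones assumed
# in the snippet above.
import numpy as np
import dask.array as da
from dask.distributed import Client

def compute_slice_sum(scheduler="128.104.222.103:8786",
                      store="/mnt/cephfs/smltar_numpyarr/zarr_data_full"):
    with Client(scheduler) as client:
        x = da.from_zarr(store)
        total = (x[0:30] + x[100:130] - x[1000:1030] + x[1400:1430]) * x[1500:1530]
        # compute() blocks until the graph has run on the cluster
        return np.asarray(total.compute())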
def handle(self, *args, **options):
    # Unpack variables
    algorithm = options['algorithm']
    bands = options['bands']
    name = options['name']
    product_pre = options['product_pre']
    product_post = options['product_post']
    lc_pre = options['lc_pre']
    lc_post = options['lc_post']
    year_pre = options['year_pre']
    year_post = options['year_post']
    filter_labels = options['filter_labels']
    mmu = options['mmu']
    extra_args = parser_extra_args(options['extra_kwargs'])
    scheduler_file = options['scheduler']

    # Build change meta object
    meta, _ = ChangeInformation.objects.get_or_create(year_pre=year_pre,
                                                      year_post=year_post,
                                                      algorithm=algorithm,
                                                      name=name)

    # Build gwf_kwargs, send a query for both products, combine the dicts and generate iterable
    gwf_kwargs = {k: options[k] for k in ['lat', 'long', 'region']}
    pre_dict = gwf_query(product_pre, view=False, **gwf_kwargs)
    post_dict = gwf_query(product_post, view=False, **gwf_kwargs)
    iterable = join_dicts(pre_dict, post_dict, join='inner').items()

    # Start cluster and run
    client = Client(scheduler_file=scheduler_file)
    client.restart()
    C = client.map(detect_and_classify_change, iterable, pure=False,
                   **{'algorithm': algorithm,
                      'change_meta': meta,
                      'band_list': bands,
                      'mmu': mmu,
                      'lc_pre': lc_pre,
                      'lc_post': lc_post,
                      'extra_args': extra_args,
                      'filter_labels': filter_labels})
    result = client.gather(C)
    print('Successfully ran change detection on %d tiles' % sum(result))
    print('%d tiles failed' % result.count(False))
def handle(self, *args, **options):
    # Unpack variables
    product = options['product']
    algorithm = options['algorithm']
    extra_args = parser_extra_args(options['extra_kwargs'])
    bands = options['bands']
    datasource = options['datasource']
    year = options['year']
    name = options['name']
    scheduler_file = options['scheduler']

    # Build segmentation meta object
    meta, _ = SegmentationInformation.objects.get_or_create(
        algorithm=algorithm, datasource=datasource,
        parameters=json.dumps(extra_args),
        datasource_year=year, name=name,
    )

    # datacube query
    gwf_kwargs = {k: options[k] for k in ['product', 'lat', 'long', 'region']}
    iterable = gwf_query(**gwf_kwargs)

    # Start cluster and run
    client = Client(scheduler_file=scheduler_file)
    client.restart()
    C = client.map(segment, iterable, pure=False,
                   **{'algorithm': algorithm,
                      'segmentation_meta': meta,
                      'band_list': bands,
                      'extra_args': extra_args})
    result = client.gather(C)
    print('Successfully ran segmentation on %d tiles' % sum(result))
    print('%d tiles failed' % result.count(False))
class RunDirectory:
    """Open data in an experiment folder."""

    weightfile = None
    griddes = None

    def __enter__(self):
        """
        Create enter method.

        The enter method just returns the object itself. It is used to work
        along with the __exit__ method that closes the distributed client.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the distributed client before exiting."""
        self.close_client()

    def __init__(self, run_dir, *, prefix=None, model_type=None,
                 overwrite=False, f90name_list=None, filetype='nc',
                 client=None):
        """
        Create a RunDirectory object from a given input directory.

        ::

            run = RunDirectory('/work/mh0066/precip-project/3-hourly/CMORPH')

        The RunDirectory object gathers all necessary information on the data
        that is stored in the run directory. Once loaded, the most important
        meta data will be stored in the run directory for faster access the
        second time.

        Parameters
        ----------
        run_dir: str
            Name of the directory where the data that should be read is stored.
        prefix: str, optional (default: None)
            filename prefix
        model_type: str, optional (default: None)
            model name/observation product that created the data. This will be
            used to generate a variable lookup table. This can be useful for
            loading various model datasets and comparing them while only
            accessing the data with one set of variable names. By default no
            lookup table will be generated.
        overwrite: bool, optional (default: False)
            If true the meta data will be generated again even if it has been
            stored to disk already.
        f90name_list: str, optional (default: None)
            Filename of an optional f90 namelist with additional information
            about the data
        filetype: str, optional (default: nc)
            Input data file format
        client: dask.distributed client, optional (default: None)
            Configuration that is used to create a dask client which receives
            tasks for multiprocessing. By default (None) a local client will
            be started.
        """
        if isinstance(client, Client):
            self.dask_client = client
        else:
            self.dask_client = Client(client)
        self.prefix = prefix or ''
        self.variables = lookup(model_type)
        run_dir = op.abspath(str(run_dir))
        nml_file = f90name_list or 'NAMELIST_{}*'.format(prefix)
        info_file = self._hash_file(run_dir)
        if overwrite or not info_file.is_file():
            self.name_list = {}
            for nml_file in Path(run_dir).rglob(nml_file):
                self.name_list = {
                    **self.name_list,
                    **f90nml.read(str(run_dir / nml_file))
                }
            self.name_list['output'] = self._get_files(run_dir, filetype)
            self.name_list['weightfile'] = None
            self.name_list['gridfile'] = self.griddes
            self.name_list['run_dir'] = op.abspath(str(run_dir))
            self._dump_json(run_dir)
        else:
            with open(str(info_file), 'r') as f:
                self.name_list = json.load(f)

    @staticmethod
    def _hash_file(run_dir):
        run_dir = op.expanduser(str(run_dir))
        hash_obj = hashlib.md5(op.abspath(run_dir).encode())
        hash_str = str(hash_obj.hexdigest())
        return _cache_dir / Path('run_info_{}.json'.format(hash_str))

    @staticmethod
    def _get_files(run_dir, extensions):
        """Get all netcdf filenames."""
        ext_str = ''.join(
            ['[{}{}]'.format(l.lower(), l.upper()) for l in extensions])
        pat = re.compile('^(?!.*restart|.*remap).*{}'.format(ext_str))
        glob_pad = '*.{}'.format(ext_str)
        result = sorted([
            f.as_posix() for f in Path(run_dir).rglob(glob_pad)
            if re.match(pat, f.as_posix())
        ])
        return result

    @staticmethod
    def _remap(infile, out_dir=None, griddes=None, weightfile=None,
               method=None, gridfile=None, options=None):
        options = options or '-f nc4'
        if isinstance(infile, (str, Path)):
            infile = Path(infile)
            out_file = str(Path(out_dir) / infile.with_suffix('.nc').name)
        else:
            out_file = None
        with NamedTemporaryFile(dir=out_dir, suffix='.nc') as tf_in:
            if method == 'weighted':
                cdo_str = str(griddes) + ',' + str(weightfile)
                remap_func = getattr(cdo, 'remap')
            else:
                cdo_str = str(griddes)
                remap_func = getattr(cdo, method)
            if gridfile is not None:
                cdo_str += ' -setgrid,' + str(gridfile)
            if isinstance(infile, xr.DataArray):
                _ = xr.Dataset(data_vars={infile.name: infile}).to_netcdf(tf_in.name)
                kwargs = dict(returnXArray=infile.name)
                infile = Path(tf_in.name)
            elif isinstance(infile, xr.Dataset):
                _ = infile.to_netcdf(tf_in.name)
                infile = Path(tf_in.name)
                kwargs = dict(returnXDataset=True)
            else:
                kwargs = dict(output=str(out_file), options=options)
            out = remap_func('{} {}'.format(str(cdo_str), str(infile)), **kwargs)
            try:
                return out.compute()
            except AttributeError:
                return out

    @property
    def run_dir(self):
        """Get the name of the experiment path."""
        return Path(self.name_list['run_dir'])

    @property
    def files(self):
        """Return all files that have been opened."""
        return pd.Series(self.name_list['output'])

    @staticmethod
    def apply_function(mappable, collection, *, args=None, client=None, **kwargs):
        """
        Apply a function to a given collection.

        ::

            result = run.apply_function(lambda d, v: d[v].sum(dim='time'),
                                        run.dataset, args=('temp',))

        Parameters
        ----------
        mappable: method
            method that is applied
        collection: collection
            collection that is distributed in a thread pool
        args:
            additional arguments passed into the method
        client: dask distributed client (default: None)
            worker scheduler client that submits the jobs. If None is given a
            new client is started.
        progress: bool (default: True)
            display tqdm progress bar
        **kwargs: optional
            additional keyword arguments controlling the progress bar

        Returns
        -------
        combined output of the thread-pool processes: collection
        """
        client = client or Client()
        args = args or ()
        if isinstance(collection, (xr.DataArray, xr.Dataset)):
            tasks = [(client.scatter(collection), *args)]
        else:
            tasks = [(client.scatter(entry), *args) for entry in collection]
        futures = [client.submit(mappable, *task) for task in tasks]
        progress = kwargs.pop('progress', True)
        if progress is True:
            progress_bar(futures, **kwargs)
        output = client.gather(futures)
        if len(output) == 1:  # Possibly only one job was submitted
            return output[0]
        return output

    def close_client(self):
        """Close the opened dask client."""
        self.dask_client.close()

    def restart_client(self):
        """Restart the opened dask client."""
        self.dask_client.restart()

    @property
    def status(self):
        """Query the status of the dask client."""
        return self.dask_client.status

    def remap(self, grid_description, inp=None, out_dir=None, *,
              method='weighted', weightfile=None, options='-f nc4',
              grid_file=None):
        """
        Regrid to a different input grid.

        ::

            run.remap('echam_griddes.txt', method='remapbil')

        Parameters
        ----------
        grid_description: str
            Path to file containing the output grid description
        inp: (collection of) str, xarray.Dataset, xarray.DataArray
            Filenames that are to be remapped.
        out_dir: str (default: None)
            Directory name for the output
        method: str (default: weighted)
            Remap method that is applied to the data, can be either weighted
            (default), bil, con, laf, nn. If weighted is chosen this class
            should have been instantiated either with a given weightfile or
            using the gen_weights method.
        weightfile: str (default: None)
            File containing the weights for the distance-weighted remapping.
        grid_file: str (default: None)
            File containing the source grid description
        options: str (default: -f nc4)
            additional file options that are passed to cdo

        Returns
        -------
        Collection of output: (str, xarray.DataArray, xarray.Dataset)
        """
        out_dir = out_dir or TemporaryDirectory().name
        Path(out_dir).absolute().mkdir(exist_ok=True, parents=True)
        impl_methods = ('weighted', 'remapbil', 'remapcon', 'remaplaf', 'remapnn')
        weightfile = weightfile or self.weightfile
        if method not in impl_methods:
            raise NotImplementedError('Method not available. Currently'
                                      ' implemented methods are: weighted,'
                                      ' remapbil, remapcon, remaplaf, remapnn')
        if weightfile is None and method == 'weighted':
            raise ValueError('No weightfile was given, either choose a different'
                             ' remapping method or instantiate the Reader'
                             ' object by providing a weightfile or generate'
                             ' a weightfile by calling the gen_weights method')
        args = (Path(out_dir), grid_description, weightfile, method,
                grid_file, options)
        run_dir = self.name_list['run_dir']
        if inp is None:
            inp = self.files
        elif isinstance(inp, (str, Path)):
            if not Path(inp).is_file():
                inp = sorted([f for f in Path(run_dir).rglob(inp)])
            else:
                inp = (inp, )
        if len(inp) == 0:
            raise FileNotFoundError('No files for remapping found')
        return self.apply_function(self._remap, inp, args=args,
                                   client=self.dask_client, label='Remapping')

    def _dump_json(self, run_dir):
        run_dir = op.abspath(str(run_dir))
        info_file = self._hash_file(run_dir)
        name_list = self.name_list
        name_list['run_dir'] = run_dir
        name_list['json_file'] = str(info_file.absolute())
        with open(str(info_file), 'w') as f:
            json.dump(name_list, f, sort_keys=True, indent=4)

    @classmethod
    def gen_weights(cls, griddes, run_dir, *, prefix=None, model_type='ECHAM',
                    infile=None, overwrite=False, client=None):
        """
        Create grid weights from a grid description and instantiate the class.

        ::

            run = RunDirectory.gen_weights('echam_grid.txt',
                                           '/work/mh0066/precip-project/3-hourly/CMORPH/',
                                           infile='griddes.nc')

        Parameters
        ----------
        griddes: str
            filename containing the desired output grid information
        run_dir: str
            path to the experiment directory
        prefix: str
            filename prefix
        model_type: str
            Model/Product name of the dataset to be read
        infile: str
            Path to input file. By default the method looks for appropriate
            input files
        overwrite: bool, optional (default: False)
            should an existing weight file be overwritten

        Returns
        -------
        RunDirectory: RunDirectory object
        """
        try:
            out_file = [f for f in Path(run_dir).absolute().rglob('*2d*.nc')][0]
        except IndexError:
            try:
                out_file = [f for f in Path(run_dir).absolute().rglob('*.nc')][0]
            except IndexError:
                raise FileNotFoundError('Run Directory is empty')

        def get_input(rundir, inp_file):
            for file in (inp_file, op.join(rundir, 'o3_icon_DOM01.nc'),
                         op.join(rundir, 'bc_ozone.nc')):
                if op.isfile(str(file)):
                    return file

        input_file = get_input(run_dir, infile)
        weight_file = op.abspath(op.join(run_dir, 'remapweights.nc'))
        if overwrite or not os.path.isfile(weight_file):
            cmd = '{} -setgrid,{} {}'.format(op.abspath(griddes), input_file, out_file)
            weight_file = cdo.gendis(cmd, output=weight_file)
        cls.gridfile = griddes
        cls.weightfile = op.abspath(weight_file)
        return cls(run_dir, prefix=prefix, model_type=model_type,
                   overwrite=overwrite, client=client)

    def load_data(self, filenames=None, **kwargs):
        """
        Open a multi-file dataset using xarray.open_mfdataset.

        ::

            dset = run.load_data('*2008*.nc')

        Parameters
        ----------
        filenames: collection/str
            collection of filenames, filename or glob pattern for filenames
            that should be read. Default behaviour is reading all dataset files
        **kwargs: optional
            Additional keyword arguments passed to xarray's open_mfdataset

        Returns
        -------
        Xarray (multi-file) dataset: xarray.Dataset
        """
        filenames = self._get_files_from_glob_pattern(filenames) or self.files
        kwargs.setdefault('parallel', True)
        kwargs.setdefault('combine', 'by_coords')
        return xr.open_mfdataset(filenames, **kwargs)

    def _get_files_from_glob_pattern(self, filenames):
        """Construct filenames to read."""
        if isinstance(filenames, (str, Path)):
            ncfiles = [filenames, ]
        elif filenames is None:
            return None
        else:
            ncfiles = list(filenames)
        read_files = []
        for in_file in ncfiles:
            if op.isfile(in_file):
                read_files.append(str(in_file))
            else:
                read_files += [str(f) for f in self.run_dir.rglob(str(in_file))]
        return sorted(read_files)
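# A short usage sketch (the run directory path is taken from the docstring example,
# the grid file names are placeholders): generate remap weights, remap the files of
# the experiment to a new grid and lazily open the result with xarray.
run = RunDirectory.gen_weights('echam_grid.txt',
                               '/work/mh0066/precip-project/3-hourly/CMORPH/',
                               infile='griddes.nc')
remapped = run.remap('echam_grid.txt', out_dir='/tmp/remapped')
dset = run.load_data('*2008*.nc')   # xarray.Dataset, opened lazily
run.close_client()                  # shut the dask client down when done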
def Assignment1B(user_reviews_csv, products_csv):
    client = Client('127.0.0.1:8786')
    client = client.restart()

    # defining data types
    reviews_dtypes = {
        'reviewerID': np.str,
        'asin': np.str,
        'reviewerName': np.str,
        'helpful': np.object,
        'reviewText': np.str,
        'overall': np.float64,
        'summary': np.str,
        'unixReviewTime': np.float64,
        'reviewTime': np.str
    }
    products_dtypes = {
        'asin': np.str,
        'salesRank': np.object,
        'imUrl': np.str,
        'categories': np.object,
        'title': np.str,
        'description': np.str,
        'price': np.float64,
        'related': np.object,
        'brand': np.str
    }

    # instantiating dataframes as variables
    products = dd.read_csv(products_csv, dtype=products_dtypes)
    reviews = dd.read_csv(user_reviews_csv, dtype=reviews_dtypes)

    ### Question 1 ###
    # percentage of missing values for all columns in the reviews table and the products table
    products_missing_perc = np.mean(products.isnull()) * 100
    reviews_missing_perc = np.mean(reviews.isnull()) * 100

    ### Question 2 ###
    # using only the columns we need to join on
    reviews_sub = reviews[['asin', 'overall']]
    products_sub = products[['asin', 'price']]
    # declaring types to avoid TypeErrors
    reviews_sub['asin'] = reviews_sub['asin'].astype(str)
    products_sub['asin'] = products_sub['asin'].astype(str)
    # joining the dataframes and calculating the Pearson correlation
    merged_df = dd.merge(products_sub, reviews_sub, on='asin')
    pearson_correlation = merged_df[['price', 'overall']].corr()
    pearson_correlation = pearson_correlation['price']

    ### Question 3 ###
    # calculating the descriptive statistics
    descriptive_stats = products['price'].describe()

    ### Question 4 ###
    # aggregating over the categories column
    super_category = products['categories'].apply(get_super_category, meta='str').value_counts()

    # parallelizing the individual questions
    q1a, q1b, q2, q3, q4, product_asin = dd.compute(
        products_missing_perc, reviews_missing_perc, pearson_correlation,
        descriptive_stats, super_category, products.asin)

    # converting each question to the correct format for writing into json
    q1a = q1a.round(2).to_dict()
    q1b = q1b.round(2).to_dict()
    q2 = q2['overall'].round(2)
    q3 = q3.round(2)[['mean', 'std', '50%', 'min', 'max']].to_dict()
    q4 = q4.to_dict()

    ### Question 5 ###
    # check if the review ids are in the computed product ids
    product_is_not_dangling = reviews.asin.isin(product_asin)
    q5 = 0 if all(product_is_not_dangling) else 1

    ### Question 6 ###
    # extract just the related column as a dataframe
    products_related = products[['related']]
    # aggregate over just the related column as a series
    products_related['related'] = products_related.related.apply(get_related, meta='array')
    # get the list of product ids separated into individual values using .explode()
    asins = products_related.explode('related')
    # check if the list of product ids are in the computed product ids
    asin_is_not_dangling = asins.related.isin(product_asin)
    q6 = 0 if all(asin_is_not_dangling) else 1

    # correct format according to the PA1 writeup
    submit = {
        'q1': {'products': q1a, 'reviews': q1b},
        'q2': q2,
        'q3': q3,
        'q4': q4,
        'q5': q5,
        'q6': q6
    }
    with open('results_PA1.json', 'w') as outfile:
        json.dump(submit, outfile)
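# The snippet above calls two helpers, get_super_category and get_related, that are
# not shown. The following is a hedged sketch of what they plausibly do, inferred
# only from how they are used here (the real assignment code may differ): the
# 'categories' column holds a stringified nested list whose first element is the
# top-level category, and 'related' holds a stringified dict of lists of related ASINs.
import ast

def get_super_category(categories_cell):
    """Return the first (super) category from a stringified nested list."""
    try:
        parsed = ast.literal_eval(categories_cell)
        return parsed[0][0]
    except (ValueError, SyntaxError, IndexError, TypeError):
        return None

def get_related(related_cell):
    """Flatten the 'related' dict of lists into a single list of ASINs."""
    try:
        parsed = ast.literal_eval(related_cell)
        return [asin for asins in parsed.values() for asin in asins]
    except (ValueError, SyntaxError, AttributeError):
        return []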
print("Running pymoliere sentence_classifier with the following parameters:")
print(config)

# Potential cluster
if config.cluster.run_locally or config.cluster.address == "localhost":
    print("Running on local machine!")
else:
    cluster_address = f"{config.cluster.address}:{config.cluster.port}"
    print("Configuring Dask, attaching to cluster")
    print(f"\t- {cluster_address}")
    dask_client = Client(address=cluster_address)
    if config.cluster.restart:
        print("\t- Restarting cluster...")
        dask_client.restart()

# Need to make sure model_path is writable
model_path.parent.mkdir(parents=True, exist_ok=True)
# We're going to store model-specific checkpoints separately
data_ckpt_dir.mkdir(parents=True, exist_ok=True)

# All data, this is the checkpoint we depend on
sentences_with_embedding = file_util.load(
    default_ckpt_dir.joinpath("sentences_with_embedding"))

# Get only results with labels, stored as TrainingData tuples
all_data = sentences_with_embedding.map_partitions(
    filter_sentences_with_embedding)
print("Checkpoint: all_data")
checkpoint(
    all_data,
class dask_controller: #adapted from Charles' code def __init__(self,n_workers=6,local=True,queue="short",\ walltime='01:30:00',cores=1,processes=1,memory='6GB',job_extra=[]): self.local = local self.n_workers = n_workers self.walltime = walltime self.queue = queue self.processes = processes self.memory = memory self.cores = cores self.job_extra = job_extra def writedir(self, directory): if not os.path.exists(directory): os.makedirs(directory) def startdask(self): if self.local: self.daskclient = Client() self.daskclient.cluster.scale(self.n_workers) else: self.daskcluster = SLURMCluster(queue=self.queue,walltime=self.walltime,\ processes=self.processes,memory=self.memory, cores=self.cores,job_extra=self.job_extra) self.workers = self.daskcluster.start_workers(self.n_workers) self.daskclient = Client(self.daskcluster) def shutdown(self): self.daskcluster.stop_all_jobs() for item in os.listdir("./"): if "worker-" in item or "slurm-" in item or ".lock" in item: path = "./" + item if os.path.isfile(path): os.remove(path) elif os.path.isdir(path): shutil.rmtree(path) def printprogress(self): complete = len( [item for item in self.futures if item.status == "finished"]) print(str(complete) + "/" + str(len(self.futures))) def mapfovs(self, function, fov_list, retries=0): self.function = function self.retries = retries def mapallfovs(fov_number, function=function): function(fov_number) self.futures = {} for fov in fov_list: future = self.daskclient.submit(mapallfovs, fov, retries=retries) self.futures[fov] = future def retry_failed(self): self.failed_fovs = [ fov for fov, future in self.futures.items() if future.status != 'finished' ] self.daskclient.restart() time.sleep(5) self.mapfovs(self.function, self.failed_fovs, retries=self.retries) def retry_processing(self): self.proc_fovs = [ fov for fov, future in self.futures.items() if future.status == 'pending' ] self.daskclient.restart() time.sleep(5) self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
class dask_controller: #adapted from Charles' code def __init__(self,n_workers=6,local=True,queue="short",death_timeout=3.,\ walltime='01:30:00',cores=1,processes=1,memory='6GB',\ working_directory="./",job_extra=[]): self.local = local self.n_workers = n_workers self.walltime = walltime self.queue = queue self.death_timeout = death_timeout self.processes = processes self.memory = memory self.cores = cores self.working_directory = working_directory self.job_extra = job_extra writedir(working_directory, overwrite=False) def startdask(self): if self.local: self.daskclient = Client() self.daskclient.cluster.scale(self.n_workers) else: self.daskcluster = SLURMCluster(queue=self.queue,death_timeout=self.death_timeout,walltime=self.walltime,\ processes=self.processes,memory=self.memory,\ cores=self.cores,local_directory=self.working_directory,\ log_directory=self.working_directory,job_extra=self.job_extra) self.workers = self.daskcluster.start_workers(self.n_workers) self.daskclient = Client(self.daskcluster) def shutdown(self): self.daskclient.restart() if not self.local: self.daskcluster.stop_all_jobs() self.daskcluster.close() for item in os.listdir(self.working_directory): if "worker-" in item or "slurm-" in item or ".lock" in item: path = "./" + item if os.path.isfile(path): os.remove(path) elif os.path.isdir(path): shutil.rmtree(path) def printprogress(self): complete = len( [item for item in self.futures if item.status == "finished"]) print(str(complete) + "/" + str(len(self.futures))) def displaydashboard(self): link = self.daskcluster.dashboard_link display(HTML('<a href="' + link + '">Dashboard</a>')) def mapfovs(self, function, fov_list, retries=0): self.function = function self.retries = retries def mapallfovs(fov_number, function=function): function(fov_number) self.futures = {} for fov in fov_list: future = self.daskclient.submit(mapallfovs, fov, retries=retries) self.futures[fov] = future def retry_failed(self): self.failed_fovs = [ fov for fov, future in self.futures.items() if future.status != 'finished' ] out = self.daskclient.restart() self.mapfovs(self.function, self.failed_fovs, retries=self.retries) def retry_processing(self): self.proc_fovs = [ fov for fov, future in self.futures.items() if future.status == 'pending' ] out = self.daskclient.restart() self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
class LightGBMDaskLocal:
    # https://github.com/Nixtla/mlforecast/blob/main/nbs/distributed.forecast.ipynb
    """
    persist call: data = self.client.persist(data)
    (assignment replaces the old lazy array, as persist does not change the input in-place)

    To reduce the risk of hitting memory limits, consider restarting each worker
    process before running any data loading or training code.
    self.client.restart()
        - This function will restart each of the worker processes, clearing out
          anything they're holding in memory. This function does NOT restart the
          actual machines of your cluster, so it runs very quickly.
        - should the workers just be killed regardless of whether the whole process
          was successful or unsuccessful (sort of a clean-up action)? can restarting
          be that cleanup action?

    loop over hyperparameter values (method that accepts hyperparameters as a
    dictionary, initializes self.model = DaskLGBMRegressor() with each set of
    parameters and calls the method that loops over train-validation sets)
    loop over train-validation sets
    run the model's fit method and compute predicted values and RMSE
    """

    def __init__(
        self,
        curr_dt_time,
        n_workers,
        s3_path,
        startmonth,
        n_months_in_first_train_set,
        n_months_in_val_set,
        frac=None,
    ):
        self.curr_dt_time = curr_dt_time
        self.startmonth = startmonth
        self.n_months_in_first_train_set = n_months_in_first_train_set
        self.n_months_in_val_set = n_months_in_val_set
        self.frac = frac if frac is not None else 1.0

        cluster = LocalCluster(n_workers=n_workers)
        self.client = Client(cluster)
        self.client.wait_for_workers(n_workers)
        print(f"***VIEW THE DASHBOARD HERE***: {cluster.dashboard_link}")

        # self.pca_transformed = ___  # call PCA code that returns numpy array here
        # (rename self.pca_transformed to self.full_dataset)
        # numpy array can also be created from the saved (pickle) file

        # for data:
        # instead of first looping over hyperparameter values and then over different
        # train-validation sets, is it better to do it in the opposite order
        # to allow for one set of train-validation data to be created only once?
        try:
            # this commented-out code did not work without the meta= argument;
            # meta= was not tried as it needs all other columns listed, in
            # addition to the ones being recast
            # self.full_dataset = self.client.persist(
            #     dd.read_parquet(s3_path, index=False, engine="pyarrow")
            #     .sample(frac=self.frac, random_state=42)
            #     .map_partitions(
            #         self.cast_types,
            #         meta={
            #             'sid_shop_item_qty_sold_day': 'i2',
            #             **{f'cat{n}': 'i2' for n in range(1, 23)}
            #         }
            #     )
            #     .map_partitions(self.drop_neg_qty_sold)
            #     .set_index("sale_date", sorted=False, npartitions="auto")
            #     .repartition(partition_size="100MB")
            # )

            # create Dask dataframe from partitioned Parquet dataset on S3 and persist it to the cluster
            self.full_dataset = dd.read_parquet(
                s3_path, index=False, engine="pyarrow").sample(
                    frac=self.frac, random_state=42)
            self.full_dataset["sale_date"] = self.full_dataset["sale_date"].astype(
                "datetime64[ns]")
            self.full_dataset["sid_shop_item_qty_sold_day"] = self.full_dataset[
                "sid_shop_item_qty_sold_day"].astype("int16")
            for col in self.full_dataset:
                if col.startswith("cat"):
                    self.full_dataset[col] = self.full_dataset[col].astype("int16")

            logging.debug(
                f"# of rows in full dataframe before removal of negative target values: {len(self.full_dataset)}"
            )
            self.full_dataset = self.full_dataset[
                self.full_dataset.sid_shop_item_qty_sold_day >= 0]

            # call dataframe.set_index(), then repartition, then persist
            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.set_index.html
            # set_index(sorted=False, npartitions='auto')
            # df = df.repartition(npartitions=df.npartitions // 100)
            # self.full_dataset = self.client.persist(self.full_dataset)
            # _ = wait([self.full_dataset])
            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.repartition.html
            # self.full_dataset = self.full_dataset.repartition(partition_size="100MB")
            self.full_dataset = self.full_dataset.set_index(
                "sale_date",
                sorted=False,
                npartitions="auto",
                partition_size=100_000_000,
            )
            # partition_size for set_index: int, optional, desired size of
            # each partition in bytes (to be used with npartitions='auto')
            self.full_dataset = self.cull_empty_partitions(self.full_dataset)
            self.full_dataset = self.client.persist(self.full_dataset)
            _ = wait([self.full_dataset])
            logging.debug(
                f"# of rows in full dataframe after removal of negative target values: {len(self.full_dataset)}"
            )
            logging.debug(
                f"Earliest and latest dates in full dataframe are: {dd.compute(self.full_dataset.index.min(), self.full_dataset.index.max())}"
            )
            logging.debug(
                f"Data types of full Dask dataframe are: {self.full_dataset.dtypes}"
            )

        except Exception:
            logging.exception(
                "Exception occurred while creating Dask dataframe and persisting it on the cluster."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

        # finally:
        #     self.client.restart()
        #     sys.exit(1)

        # https://stackoverflow.com/questions/58437182/how-to-read-a-single-large-parquet-file-into-multiple-partitions-using-dask-dask
        # Parquet datasets can be saved into separate files.
        # Each file may contain separate row groups.
        # Dask Dataframe reads each Parquet row group into a separate partition.

        # I DON'T WANT TO KEEP THE NUMPY ARRAY IN MEMORY, SO IT NEEDS TO BE
        # DELETED AFTER DASK ARRAY IS CREATED
        # MIGHT BE BETTER TO CREATE DASK ARRAY FROM FILE ON S3, TO AVOID
        # HAVING BOTH NUMPY ARRAY AND PERSISTED DASK ARRAY IN MEMORY
        # I ALSO WANT TO SPLIT THAT NUMPY ARRAY INTO MULTIPLE TRAIN AND VALIDATION
        # SETS, SO WHAT'S THE BEST WAY TO DO THAT?
        # SEND THE ENTIRE ARRAY TO THE CLUSTER AT ONCE - PROBABLY NOT, OR
        # SEND TRAIN AND VALIDATION SETS ONE BY ONE AND DELETE?
        # BUT THAT WILL REQUIRE SENDING DATA TO THE CLUSTER MULTIPLE TIMES -
        # NOT IF THE DATA BEING SENT ARE DIFFERENT EACH TIME
        # THEY ARE NOT GOING TO BE COMPLETELY DIFFERENT BECAUSE TRAIN DATA WILL
        # JUST CONTINUE TO MERGE WITH VALIDATION SETS AND GROW
        # CREATE FIRST DASK ARRAY AND SEND TO CLUSTER, THEN APPEND TO IT?
        # IT DOES NOT LOOK LIKE DASK WOULD ALLOW THAT (SEE
        # https://github.com/dask/distributed/issues/1676 -
        # "You should also be aware that the task/data model underlying dask
        # arrays is immutable. You should never try to modify memory in-place.")
        # SO PROBABLY SEND ALL OF THE DATA TO THE CLUSTER AT THE BEGINNING,
        # THEN TAKE CHUNKS OF IT FOR WALK-FORWARD VALIDATION
        # PROBABLY SHOULD RELY ON LOADING DATA FROM FILE USING DELAYED / FROM_DELAYED
        # SEE https://stackoverflow.com/questions/45941528/how-to-efficiently-send-a-large-numpy-array-to-the-cluster-with-dask-array)
        # can I use a function to read multiple files into one Dask array?
        # either figure out how to read multiple files (saved on S3) into one Dask array, or
        # figure out how to save one array of PCA results to S3 (need disk space
        # to save it locally before transfer to S3 and need a method that can
        # handle transfer of more than 5GB - multipart transfer to S3)
        # try to write PCA-transformed data directly to zarr array (stored in memory)
        # then upload it to S3 (directly from memory)
        # then create dask array from that zarr array in S3
        # try to write PCA-transformed data to xarray then upload it to S3 as zarr
        # save numpy array to parquet file, upload that file to S3 (using upload_file),
        # then read that file into a Dask dataframe
        # write data to parquet on S3 from pandas dataframe and append to it using the awswrangler library?
        # (https://github.com/awslabs/aws-data-wrangler/blob/main/tutorials/004%20-%20Parquet%20Datasets.ipynb)
        # df = dd.read_parquet('s3://bucket/my-parquet-data')
        # (https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html#dask.dataframe.read_parquet)
        # from the above link:
        # engine argument: If 'pyarrow' or 'pyarrow-dataset' is specified, the
        # ArrowDatasetEngine (which leverages the pyarrow.dataset API) will be used.
        # read partitioned parquet dataset with Dask:
        # https://stackoverflow.com/questions/67222212/read-partitioned-parquet-dataset-written-by-spark-using-dask-and-pyarrow-dataset

    # def cast_types(self, df):
    #     df = df.copy()
    #     df['sale_date'] = df["sale_date"].astype("datetime64[ns]")
    #     for col in df:
    #         if col.startswith("cat") or (col == "sid_shop_item_qty_sold_day"):
    #             df[col] = df[col].astype("int16")
    #     return df
    #
    # def drop_neg_qty_sold(self, df):
    #     return df[df.sid_shop_item_qty_sold_day >= 0].copy()

    # function from https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
    def cull_empty_partitions(self, ddf):
        ll = list(ddf.map_partitions(len).compute())
        ddf_delayed = ddf.to_delayed()
        ddf_delayed_new = list()
        pempty = None
        for ix, n in enumerate(ll):
            if 0 == n:
                pempty = ddf.get_partition(ix)
            else:
                ddf_delayed_new.append(ddf_delayed[ix])
        if pempty is not None:
            ddf = dd.from_delayed(ddf_delayed_new, meta=pempty)
        return ddf

    def gridsearch_wfv(self, params):
        # self.hyperparameters = hyperparameters
        # self.rmse_results = defaultdict(list)  # replace this variable by creating a key-value
        # in the self.hyper_dict dictionary with value containing the list of RMSE values
        self.all_params_combs = list()
        # determine if there is more than one combination of hyperparameters;
        # if only one combination, set the get_stats_ flag to True
        self.get_stats_ = (
            len(params[max(params, key=lambda x: len(params[x]))]) == 1)
        for params_comb_dict in (
                dict(zip(params.keys(), v))
                for v in list(product(*list(params.values())))):
            # for self.hyper_dict in hyperparameters:
            # self.params_combs_list.append(params_comb_dict)
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = lgb.DaskLGBMRegressor(
                    client=self.client,
                    random_state=42,
                    silent=False,
                    tree_learner="data",
                    force_row_wise=True,
                    **params_comb_dict,
                )
            except Exception:
                logging.exception("Exception occurred while initializing Dask model.")
                # kill all active work, delete all data on the network, and restart the worker processes.
                self.client.restart()
                sys.exit(1)

            # call the method that loops over train-validation sets
            with performance_report(filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])
            self.all_params_combs.append(self.params_comb_dict)

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # remove non-parameter key-values from self.best_params_ (i.e., rmse_list_ and avg_rmse_, etc.)
        self.best_params_ = {k: v for k, v in best_params.items() if k in params}

        # save the list of parameter-result dictionaries to a dataframe and then to CSV
        if self.all_params_combs:
            all_params_combs_df = pd.DataFrame(self.all_params_combs)
            output_csv = "all_params_combs.csv"
            all_params_combs_df.to_csv(output_csv, index=False)

            try:
                key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
                # global s3_client
                s3_client = boto3.client("s3")
                response = s3_client.upload_file(output_csv, "sales-demand-data", key)
                logging.info(
                    "Name of CSV uploaded to S3 and containing all parameter combinations "
                    f"and results is: {key}")
            except ClientError as e:
                logging.exception(
                    "CSV file with LightGBM parameter combinations and results was not copied to S3."
                )
        else:
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )

        # probably do the opposite:
        # loop over train-validation splits (persisting that data in memory)
        # and run different models on one split, saving the results that can
        # later be aggregated

        # is it possible to read the full range of dates needed for time series
        # validation and then drop/delete rows from the array or move some rows
        # to another array:
        # start with July-September (train) + October (validation),
        # then remove October and move September from train to validation

    # def time_split(self):
    #     return (
    #         self.full_dataset.loc[:self.end_date],
    #         self.full_dataset.loc[self.end_date + timedelta(days=1):
    #                               self.end_date + relativedelta(months=self.n_months_in_val_set, day=31)]
    #         # self.full_dataset[date > self.end_date & date <= self.end_date + relativedelta(months=n_months_in_val_set, day=31)]
    #         # less than or equal to the last day of the month currently used for validation
    #     )

    def train_test_time_split(self):
        # first (earliest) month: July 2015
        # number of months in first train set: 1
        # number of months in validation set: 2
        #
        # number of months between Oct 2015 and July 2015: 3
        # 3 - (2 - 1) = 2 (two 2-month intervals inside a 3-month interval)
        # (where 2 is the number of months in the validation set)
        # (3 - n_months_in_first_train_set + 1) - (2 - 1)
        n_val_sets = (
            month_counter(self.startmonth)  # self.startmonth is e.g. July 1, 2015
            - self.n_months_in_first_train_set + 1) - (self.n_months_in_val_set - 1)
        for m in range(n_val_sets):
            end_date = self.startmonth + relativedelta(
                months=m + self.n_months_in_first_train_set - 1, day=31)
            if self.get_stats_:
                get_stats = m == n_val_sets - 1
            else:
                get_stats = False
            yield (
                self.full_dataset.loc[:end_date],
                self.full_dataset.loc[end_date + timedelta(days=1):
                                      end_date + relativedelta(months=self.n_months_in_val_set, day=31)],
                get_stats,
            )
        # self.train, self.test = self.time_split(self.full_dataset, self.end_date)

    def get_sample_weights(self, train):
        weights_arr = train["sid_shop_item_qty_sold_day"].to_dask_array(
            lengths=True).astype('float32')
        weights_arr = da.where(weights_arr == 0,
                               self.params_comb_dict['weight_for_zeros'], 1.)
        return weights_arr

    def fit(self, train):
        try:
            start_time = time.perf_counter()
            logging.debug(
                f"train X dtypes are {train[[col for col in train if col.startswith(('pc', 'cat'))]].dtypes}"
            )
            logging.debug(
                f"train y type is {train['sid_shop_item_qty_sold_day'].dtype}")
            self.model.fit(
                train[[col for col in train if col.startswith(("pc", "cat"))]].to_dask_array(lengths=True),
                train["sid_shop_item_qty_sold_day"].to_dask_array(lengths=True),
                sample_weight=self.get_sample_weights(train),
                feature_name=[col for col in train if col.startswith(("pc", "cat"))],
                categorical_feature=[col for col in train if col.startswith("cat")],
            )
            assert self.model.fitted_
            self.params_comb_dict["fit_times_list_"].append(
                time.perf_counter() - start_time)
            return self
        except Exception:
            logging.exception(
                "Exception occurred while fitting model on train data during walk-forward validation."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def predict(self, test):
        try:
            self.y_pred = self.model.predict(
                test[[col for col in test if col.startswith(("pc", "cat"))]])
            return self
        except Exception:
            logging.exception(
                "Exception occurred while computing predicted values on the test data."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def rmse_all_folds(self, test, get_stats):
        try:
            # logging.debug(f"Data type of test['sid_shop_item_qty_sold_day'] is: {type(test['sid_shop_item_qty_sold_day'])}")
            # logging.debug(f"Data type of self.y_pred is: {type(self.y_pred)}")
            # logging.debug(f"Shape of test['sid_shop_item_qty_sold_day'] is: {test['sid_shop_item_qty_sold_day'].compute().shape}")
            # logging.debug(f"Shape of self.y_pred is: {self.y_pred.compute().shape}")
            self.params_comb_dict["rmse_list_"].append(
                calc_rmse(
                    test["sid_shop_item_qty_sold_day"].to_dask_array(lengths=True),
                    self.y_pred.compute_chunk_sizes(),
                    get_stats,
                ))
            # self.rmse_results[json.dumps(self.hyper_dict)].append(calc_rmse(test[["sid_shop_item_qty_sold_day"]], self.y_pred))
            self.params_comb_dict["monthly_rmse_list_"].append(
                calc_monthly_rmse(
                    test[["shop_id", "item_id", "sid_shop_item_qty_sold_day"]],
                    self.y_pred,
                ))
        except Exception:
            logging.exception(
                "Exception occurred while computing RMSE on the test data.")
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)

    def refit_and_save(self, model_path):
        """
        https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
        """
        try:
            self.best_model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **self.best_params_,
            )
            self.best_model.fit(
                self.full_dataset[[
                    col for col in self.full_dataset if col.startswith(("pc", "cat"))
                ]].to_dask_array(lengths=True),
                self.full_dataset["sid_shop_item_qty_sold_day"].to_dask_array(
                    lengths=True),
                sample_weight=self.get_sample_weights(self.full_dataset),
                feature_name=[
                    col for col in self.full_dataset if col.startswith(("pc", "cat"))
                ],
                categorical_feature=[
                    col for col in self.full_dataset if col.startswith("cat")
                ],
            )
            output_txt = str(model_path).split("/")[-1]
            booster = self.best_model.booster_.save_model(output_txt)

            # output_txt = str(model_path).split('/')[-1]
            # global s3_client
            s3_client = boto3.client("s3")
            response = s3_client.upload_file(output_txt, "sales-demand-data", output_txt)
            logging.info(f"Name of saved model uploaded to S3 is: {output_txt}")
        except (Exception, ClientError):
            logging.exception(
                "Exception occurred while fitting model on the full dataset and saving the booster to file on S3."
            )
            # kill all active work, delete all data on the network, and restart the worker processes.
            self.client.restart()
            sys.exit(1)
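# A short usage sketch for the class above (argument values are illustrative, not
# from the original project): run a small walk-forward grid search on a local
# cluster, then refit the best parameter set on the full dataset and save the booster.
from datetime import datetime, date

model = LightGBMDaskLocal(
    curr_dt_time=datetime.now().strftime("%Y_%m_%d_%H_%M"),
    n_workers=4,
    s3_path="s3://sales-demand-data/parquet_dataset/",   # assumed dataset location
    startmonth=date(2015, 7, 1),
    n_months_in_first_train_set=1,
    n_months_in_val_set=2,
    frac=0.5,
)
model.gridsearch_wfv({"num_leaves": [31, 63], "learning_rate": [0.05, 0.1]})
model.refit_and_save("lgbm_model.txt")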
def beta_parallel_disk_detection(dataset,
                                 probe,
                                 # rxmin=None,  # these would allow selecting a sub section
                                 # rxmax=None,
                                 # rymin=None,
                                 # rymax=None,
                                 # qxmin=None,
                                 # qxmax=None,
                                 # qymin=None,
                                 # qymax=None,
                                 probe_type="FT",
                                 dask_client=None,
                                 dask_client_params: dict = None,
                                 restart_dask_client=True,
                                 close_dask_client=False,
                                 return_dask_client=True,
                                 *args, **kwargs):
    """
    This is not fully validated currently so it may not work; please report bugs on
    the py4DSTEM github page.

    This parallelises the disk detection for all probe positions. It can operate on
    either in-memory or out-of-memory datasets.

    There is an assumption that, unless specified otherwise, you are parallelising on
    a single local machine. If this is not the case it is probably best to pass the
    dask_client into the function, although you can just pass the required arguments
    to dask_client_params. If no dask_client arguments are passed it will create a
    dask_client for a local machine.

    Note: Do not pass the "peaks" argument as a kwarg, like you might in
    "_find_Bragg_disks_single_DP_FK", as the results will be unreliable and may cause
    the calculation to crash.

    Args:
        dataset (py4dSTEM datacube): 4DSTEM dataset
        probe (ndarray): can be a regular probe kernel or a Fourier-transformed one
        probe_type (str): "FT" or None
        dask_client (distributed.client.Client): dask client
        dask_client_params (dict): parameters to pass to the dask client or dask cluster
        restart_dask_client (bool): if True, function will attempt to restart the dask_client.
        close_dask_client (bool): if True, function will attempt to close the dask_client.
        return_dask_client (bool): if True, function will return the dask_client.
        *args, **kwargs: will be passed to "_find_Bragg_disks_single_DP_FK",
            e.g. corrPower, sigma, edgeboundary...

    Returns:
        peaks (PointListArray): the Bragg peak positions and the correlation intensities
        dask_client (optional) (distributed.client.Client): dask_client for use later.
    """
    # TODO add asserts about peaks not being passed
    # Dask Client stuff
    # TODO how to guess at default params for client, sqrt no. cores. Something to do
    # with the size of the diffraction pattern; write a function which can do this.
    # TODO replace dask part with a with statement for easier clean up, e.g.
    # with LocalCluster(params) as cluster, Client(cluster) as client:
    #     ... dask stuff.
    # TODO add assert statements and other checks. Think about reordering operations

    if dask_client == None:
        if dask_client_params != None:
            dask.config.set({'distributed.worker.memory.spill': False,
                             'distributed.worker.memory.target': False})
            cluster = LocalCluster(**dask_client_params)
            dask_client = Client(cluster, **dask_client_params)
        else:
            # AUTO MAGICALLY SET?
            # LET DASK SET?
            # HAVE A FUNCTION WHICH RUNS ON A SUBSET OF THE DATA TO PICK OPTIMAL VALUES?
            # psutil could be used to count cores.
            dask.config.set({'distributed.worker.memory.spill': False,  # stops spilling to disk
                             'distributed.worker.memory.target': False})  # stops spilling to disk and erroring out
            cluster = LocalCluster()
            dask_client = Client(cluster)
    else:
        assert type(dask_client) == distributed.client.Client
        if restart_dask_client:
            try:
                dask_client.restart()
            except Exception as e:
                print('Could not restart dask client. Try manually restarting outside '
                      'or passing "restart_dask_client=False"')  # WARNING STATEMENT
                return e
        else:
            pass

    # Probe stuff
    assert (probe.shape == dataset.data.shape[2:]), "Probe and Diffraction Pattern Shapes are Mismatched"
    if probe_type != "FT":
        # TODO clean up and pull out redundant parts
        # if probe.dtype != (np.complex128 or np.complex64 or np.complex256):
        # DO FFT SHIFT THING
        probe_kernel_FT = np.conj(np.fft.fft2(probe))
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()
        # delayed_probe_kernel_FT = delayed(probe_kernel_FT)
    else:
        probe_kernel_FT = probe
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()

    # GET DATA
    # TODO add another elif: if it is a dask array then pass
    if type(dataset.data) == np.ndarray:
        dask_data = da.from_array(dataset.data, chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    elif dataset.stack_pointer != None:
        dask_data = da.from_array(dataset.stack_pointer, chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    else:
        print("Couldn't access the data")
        return None

    # Convert the data to delayed
    dataset_delayed = dask_data.to_delayed()
    # TODO Trim data e.g. rx, ry, qx, qy
    # I can pass the index values in here; I should trim the probe and diffraction pattern first

    # Into the meat of the function

    # create an empty list to which we will append the delayed functions
    res = []
    # loop over dataset_delayed and create a delayed function for each chunk
    for x in np.ndindex(dataset_delayed.shape):
        temp = delayed(_find_Bragg_disks_single_DP_FK_dask_wrapper)(
            dataset_delayed[x],
            probe_kernel_FT=dask_probe_delayed[0, 0],
            # probe_kernel_FT=delayed_probe_kernel_FT,
            *args, **kwargs)  # passing through args from earlier or should I use
            # corrPower=corrPower,
            # sigma=sigma_gaussianFilter,
            # edgeBoundary=edgeBoundary,
            # minRelativeIntensity=minRelativeIntensity,
            # minPeakSpacing=minPeakSpacing,
            # maxNumPeaks=maxNumPeaks,
            # subpixel='poly')
        res.append(temp)
    _temp_peaks = dask_client.compute(res, optimize_graph=True)  # creates futures and starts computing
    output = dask_client.gather(_temp_peaks)  # gather the future objects

    coords = [('qx', float), ('qy', float), ('intensity', float)]
    peaks = PointListArray(coordinates=coords, shape=dataset.data.shape[:-2])

    # temp_peaks[0][0]

    # operating over a list so we need the size (0->count) and re-create the
    # probe positions (0->rx, 0->ry)
    for (count, (rx, ry)) in zip([i for i in range(dataset.data[..., 0, 0].size)],
                                 np.ndindex(dataset.data.shape[:-2])):
        # peaks.get_pointlist(rx, ry).add_pointlist(temp_peaks[0][count])
        # peaks.get_pointlist(rx, ry).add_pointlist(output[count][0])
        peaks.get_pointlist(rx, ry).add_pointlist(output[count])

    # Clean up
    dask_client.cancel(_temp_peaks)  # removes from the dask workers
    del _temp_peaks  # deletes the object
    if close_dask_client:
        dask_client.close()
        return peaks
    elif close_dask_client == False and return_dask_client == True:
        return peaks, dask_client
    elif close_dask_client and return_dask_client == False:
        return peaks
    else:
        print('Dask Client in unknown state, this may result in unpredictable behaviour later')
        return peaks
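# A hedged usage sketch for the function above (the dataset and probe variables are
# placeholders loaded elsewhere; default local-cluster settings are used): run the
# parallel disk detection and keep the returned client for later calls.
peaks, client = beta_parallel_disk_detection(
    dataset,                    # a py4DSTEM datacube
    probe,                      # probe kernel, here already Fourier-transformed
    probe_type="FT",
    restart_dask_client=False,  # nothing to restart on a freshly created local cluster
    return_dask_client=True,
)
client.close()                  # shut the local cluster down once finished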
def main(args):
    """Main function of cellanneal."""
    if (args.start_temp is not None or args.end_temp is not None) and args.auto_temp == 1:
        raise Exception(
            "when auto_temp is set to 1 (default value), starting temperature or "
            "ending temperature should not be set manually")

    if not args.no_parallel:
        import dask
        from dask.distributed import Client, LocalCluster
        if not args.cluster:
            cluster = LocalCluster(
                n_workers=args.workers,
                threads_per_worker=1,
            )
            client = Client(cluster)
        else:
            cluster = args.cluster
            client = Client(cluster)
            client.restart()
        cwd = Path(__file__).parent.absolute()
        client.upload_file(cwd / 'drawing.py')
        client.upload_file(cwd / 'mathhelper.py')
        client.upload_file(cwd / 'cell.py')
        client.upload_file(cwd / 'colony.py')
        client.upload_file(cwd / 'optimization.py')
        client.upload_file(cwd / 'drawing.py')
        client.upload_file(cwd / 'global_optimization.py')
        client.upload_file(cwd / 'main.py')
    else:
        client = None

    lineagefile = None
    start = time.time()

    try:
        config = load_config(args.config)

        simulation_config = config["simulation"]
        # Maybe better to store the image type in the config file in the first place, instead of using cmd?
        if args.graySynthetic == True:
            simulation_config["image.type"] = "graySynthetic"
        elif args.phaseContrast == True:
            simulation_config["image.type"] = "phaseContrastImage"
        elif args.binary == True:
            simulation_config["image.type"] = "binary"
        else:
            raise ValueError("Invalid Command: Synthetic image type must be specified")

        if not args.output.is_dir():
            args.output.mkdir()
        if not args.bestfit.is_dir():
            args.bestfit.mkdir()
        if args.residual and not args.residual.is_dir():
            args.residual.mkdir()

        seed = int(start * 1000) % (2**32)
        if args.seed != None:
            seed = args.seed
        np.random.seed(seed)
        print("Seed: {}".format(seed))

        celltype = config['global.cellType'].lower()

        # set up the colony from a file with the initial properties
        lineageframes = LineageFrames()
        colony = lineageframes.forward()
        imagefiles = get_inputfiles(args)
        if args.lineage_file:
            load_colony(colony, args.lineage_file, config, initial_frame=imagefiles[0].name)
        else:
            load_colony(colony, args.initial, config)
        cost_diff = (-1, -1)

        # open the lineage file for writing
        lineagefile = open(args.output / 'lineage.csv', 'w')
        header = ['file', 'name']
        if celltype == 'bacilli':
            header.extend(['x', 'y', 'width', 'length', 'rotation', "split_alpha", "opacity"])
        print(','.join(header), file=lineagefile)

        if args.debug:
            with open(args.debug / 'debug.csv', 'w') as debugfile:
                print(','.join([
                    'window_start', 'window_end', 'pbad_total', 'bad_count',
                    'temperature', 'total_cost_diff', 'current_iteration',
                    'total_iterations'
                ]), file=debugfile)

        if args.global_optimization:
            global useDistanceObjective

            useDistanceObjective = args.dist
            realimages = [optimization.load_image(imagefile) for imagefile in imagefiles]
            window = config["global_optimizer.window_size"]
            if args.lineage_file:
                lineage = global_optimization.build_initial_lineage(
                    imagefiles, args.lineage_file, args.continue_from, config["simulation"])
            else:
                lineage = global_optimization.build_initial_lineage(
                    imagefiles, args.initial, args.continue_from, config["simulation"])
            lineage = global_optimization.find_optimal_simulation_confs(
                imagefiles, lineage, realimages, args.continue_from)
            sim_start = args.continue_from - args.frame_first
            print(sim_start)
            shape = realimages[0].shape
            synthimages = []
            cellmaps = []
            distmaps = []
            iteration_per_cell = config["iteration_per_cell"]
            if not useDistanceObjective:
                distmaps = [None] * len(realimages)
            for window_start in range(1 - window, len(realimages)):
                window_end = window_start + window
                print(window_start, window_end)
                if window_end <= len(realimages):
                    # get initial estimate
                    if window_start >= sim_start:
                        if window_end > 1:
                            lineage.copy_forward()
                    realimage = realimages[window_end - 1]
                    synthimage, cellmap = optimization.generate_synthetic_image(
                        lineage.frames[window_end - 1].nodes, shape,
                        lineage.frames[window_end - 1].simulation_config)
                    synthimages.append(synthimage)
                    cellmaps.append(cellmap)
                    if useDistanceObjective:
                        distmap = distance_transform_edt(realimage < .5)
                        distmap /= config[
                            f'{config["global.cellType"].lower()}.distanceCostDivisor'] * config[
                                'global.pixelsPerMicron']
                        distmap += 1
                        distmaps.append(distmap)
                    if args.auto_temp == 1 and window_end == 1:
                        print("auto temperature schedule started")
                        args.start_temp, args.end_temp = \
                            global_optimization.auto_temp_schedule(
                                imagefiles, lineage, realimages, synthimages, cellmaps,
                                distmaps, 0, 1, lineagefile, args, config)
                        print("auto temperature schedule finished")
                        print("starting temperature is ", args.start_temp,
                              "ending temperature is ", args.end_temp)
                    if args.auto_meth == "frame" and optimization.auto_temp_schedule_frame(window_end, 3):
                        print("auto temperature schedule restarted")
                        args.start_temp, args.end_temp = \
                            global_optimization.auto_temp_schedule(
                                imagefiles, lineage, realimages, synthimages, cellmaps,
                                distmaps, window_start, window_end, lineagefile, args, config)
                        print("auto temperature schedule finished")
                        print("starting temperature is ", args.start_temp,
                              "ending temperature is ", args.end_temp)

                if window_start >= sim_start:
                    if useDistanceObjective:
                        global_optimization.totalCostDiff = optimization.dist_objective(
                            realimage, synthimage, distmap, cellmap, config["overlap.cost"])
                    else:
                        global_optimization.totalCostDiff = optimization.objective(
                            realimage, synthimage, cellmap, config["overlap.cost"],
                            config["cell.importance"])
                    lineage, synthimages, distmaps, cellmaps = global_optimization.optimize(
                        imagefiles, lineage, realimages, synthimages, cellmaps, distmaps,
                        window_start, window_end, lineagefile, args, config,
                        iteration_per_cell, client=client)
                if window_start >= 0:
                    global_optimization.save_lineage(
                        imagefiles[window_start].name,
                        lineage.frames[window_start].nodes, lineagefile)
                    global_optimization.save_output(
                        imagefiles[window_start].name, synthimages[window_start],
                        realimages[window_start], lineage.frames[window_start].nodes,
                        args, config)
            return 0

        config["simulation"] = optimization.find_optimal_simulation_conf(
            config["simulation"], optimization.load_image(imagefiles[0]), list(colony))
        if args.auto_temp == 1:
            print("auto temperature schedule started")
            args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                imagefiles[0], lineageframes.forward(), args, config)
            print("auto temperature schedule finished")
            print("starting temperature is ", args.start_temp,
                  "ending temperature is ", args.end_temp)

        frame_num = 0
        prev_cell_num = len(colony)
        for imagefile in imagefiles:  # recompute the temperature when needed
            frame_num += 1

            if args.auto_meth == "frame":
                if optimization.auto_temp_schedule_frame(frame_num, 8):
                    print("auto temperature schedule started (recomputed)")
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)
            elif args.auto_meth == "factor":
                if optimization.auto_temp_schedule_factor(len(colony), prev_cell_num, 1.1):
                    print("auto temperature schedule started (recomputed)")
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)
                    prev_cell_num = len(colony)
            elif args.auto_meth == "const":
                if optimization.auto_temp_schedule_const(len(colony), prev_cell_num, 10):
                    print("auto temperature schedule started (recomputed)")
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)
                    prev_cell_num = len(colony)
            elif args.auto_meth == "cost":
                print(cost_diff, frame_num, optimization.auto_temp_shcedule_cost(cost_diff))
                if frame_num >= 2 and optimization.auto_temp_shcedule_cost(cost_diff):
                    print("auto temperature schedule started cost_diff (recomputed)")
                    args.start_temp, args.end_temp = optimization.auto_temp_schedule(
                        imagefile, colony, args, config)
                    print("auto temperature schedule finished")
                    print("starting temperature is ", args.start_temp,
                          "ending temperature is ", args.end_temp)

            colony = optimize(imagefile, lineageframes, args, config, client)

            cost_diff = optimization.update_cost_diff(colony, cost_diff)

            # flatten modifications and save cell properties
            colony.flatten()
            for cellnode in colony:
                properties = [imagefile.name, cellnode.cell.name]
                if celltype == 'bacilli':
                    properties.extend([
                        str(cellnode.cell.x),
                        str(cellnode.cell.y),
                        str(cellnode.cell.width),
                        str(cellnode.cell.length),
                        str(cellnode.cell.rotation)
                    ])
                print(','.join(properties), file=lineagefile)

    except KeyboardInterrupt as error:
        raise error
    finally:
        if lineagefile:
            lineagefile.close()

    print(f'{time.time() - start} seconds')
    if client and not cluster:
        client.shutdown()

    return 0
def handle(self, *args, **options): # Unpack variables product = options['product'] model = options['model'] name = options['name'] training = options['training'] sp = options['spatial_aggregation'] kwargs = parser_extra_args(options['extra_kwargs']) categorical_variables = options['categorical_variables'] sample = options['sample'] filename = options['filename'] scheduler_file = options['scheduler'] remove_outliers = options['remove_outliers'] # Prepare encoding of categorical variables if any specified if categorical_variables is not None: kwargs.update(categorical_features=var_to_ind(categorical_variables)) # Load model class if filename is None: try: module = import_module('madmex.modeling.supervised.%s' % model) Model = module.Model except ImportError as e: raise ValueError('Invalid model argument') # datacube query gwf_kwargs = { k: options[k] for k in ['product', 'lat', 'long', 'region']} iterable = gwf_query(**gwf_kwargs) # Start cluster and run client = Client(scheduler_file=scheduler_file) client.restart() C = client.map(extract_tile_db, iterable, pure=False, **{'sp': sp, 'training_set': training, 'sample': sample}) arr_list = client.gather(C) logger.info('Completed extraction of training data from %d tiles', len(arr_list)) # Zip list of predictors, target into two lists X_list, y_list = zip(*arr_list) # Filter Nones X_list = [x for x in X_list if x is not None] y_list = [x for x in y_list if x is not None] # Concatenate the lists X = np.concatenate(X_list) y = np.concatenate(y_list) # Optionally run outliers removal if remove_outliers: X, y = Model.remove_outliers(X, y) # Optionally write the arrays to pickle file if filename is not None: logger.info('Writing X and y arrays to pickle file, no model will be fitted') with open(filename, 'wb') as dst: pickle.dump((X, y), dst) else: print("Fitting %s model for %d observations" % (model, y.shape[0])) # Fit model mod = Model(**kwargs) mod.fit(X, y) # Write the fitted model to the database mod.to_db(name=name, recipe=product, training_set=training)
def handle(self, *args, **options): path = os.path.join(INGESTION_PATH, 'recipes', options['name']) if not os.path.exists(path): os.makedirs(path) # Prepare a few variables try: recipe_meta = RECIPES[options['recipe']] except KeyError: raise ValueError('Selected recipe does not exist') product = recipe_meta['product'] fun = recipe_meta['fun'] yaml_file = recipe_meta['config_file'] begin = datetime.strptime(options['begin'], '%Y-%m-%d') end = datetime.strptime(options['end'], '%Y-%m-%d') time = (begin, end) center_dt = mid_date(begin, end) scheduler_file = options['scheduler'] # database query gwf_kwargs = { k: options[k] for k in ['lat', 'long', 'region', 'begin', 'end'] } gwf_kwargs.update(product=product) iterable = gwf_query(**gwf_kwargs) # Start cluster and run client = Client(scheduler_file=scheduler_file) client.restart() C = client.map(fun, iterable, pure=False, **{ 'center_dt': center_dt, 'path': path }) nc_list = client.gather(C) n_tiles = len([x for x in nc_list if x is not None]) logger.info('Processing done, %d tiles written to disk' % n_tiles) # Add product product_description = yaml_to_dict(yaml_file) pr, dt = add_product_from_yaml(yaml_file, options['name']) # Function to run on the list of filenames returned by Client.map() def index_nc_file(nc): """Helper function with tons of variables taken from the local environment """ try: print("Adding %s to datacube database" % nc) metadict = metadict_from_netcdf( file=nc, description=product_description, center_dt=center_dt, from_dt=begin, to_dt=end, algorithm=options['recipe']) add_dataset(pr=pr, dt=dt, metadict=metadict, file=nc) except Exception as e: pass [index_nc_file(x) for x in nc_list]
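# The indexing step above silently swallows failures and also passes None
# entries (failed tiles) to metadict_from_netcdf. A hedged alternative, using
# only names already defined in the handler above, that skips failed tiles and
# logs indexing errors instead of ignoring them:
for nc in nc_list:
    if nc is None:
        continue
    try:
        print("Adding %s to datacube database" % nc)
        metadict = metadict_from_netcdf(file=nc,
                                        description=product_description,
                                        center_dt=center_dt,
                                        from_dt=begin,
                                        to_dt=end,
                                        algorithm=options['recipe'])
        add_dataset(pr=pr, dt=dt, metadict=metadict, file=nc)
    except Exception as e:
        logger.warning('Failed to index %s: %s', nc, e)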
def compute_stats(user_reviews_csv, products_csv): client = Client('127.0.0.1:8786') client = client.restart() rev_ids = user_reviews_csv['asin'].persist() id_list = products_csv['asin'].persist() # Q1 prod_prop = (products_csv.isnull().mean() * 100).compute().round(2) rev_prop = (user_reviews_csv.isnull().mean() * 100).compute().round(2) # Q2 corr_df = user_reviews_csv[['asin', 'overall']].merge(products_csv[['asin', 'price']], on='asin') corrSeries = corr_df[['overall', 'price']].corr().compute() # Q3 priceStats = products_csv.price.describe(percentiles=[0.5]).compute() # Q4 def category_eval(df): return df['categories'].apply(clean_category) def clean_category(val): try: return ast.literal_eval(val)[0][0] except: return val clean_cats = products_csv.map_partitions(category_eval) category_counts = clean_cats.value_counts().compute() # Q5 def q5check_ids_exist(df, id_list): for i in df.to_frame().iterrows(): id_to_check = list(i[1])[0] if not (id_to_check in id_list): return 1 return 0 q5_ans = q5check_ids_exist(rev_ids, id_list) # Q6 def related_eval(df): return df['related'].apply(related_to_prod_list) def related_to_prod_list(related_dict): try: related_dict = ast.literal_eval(related_dict) return list(chain(*related_dict.values())) except: return related_dict def q6check_ids_exist(df, id_list): for i in df.iterrows(): idList = list(i[1])[0] if type(idList) == list: for related_id in idList: if not (related_id in id_list): return 1 return 0 flatRelatedIds = products_csv.map_partitions(related_eval).to_frame() q6_ans = q6check_ids_exist(flatRelatedIds, id_list) out_dict = {"q1": {"products": dict(prod_prop), "reviews": dict(rev_prop)}, "q2": corrSeries['overall']['price'].round(2), "q3": dict(priceStats.drop('count').round(2)), "q4": dict(category_counts), "q5": q5_ans, "q6": q6_ans} def convert(o): if isinstance(o, np.int64): return int(o) raise TypeError with open('results_1B.json', 'w') as outfile: json.dump(out_dict, outfile, default=convert)
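# A minimal sketch (not part of the original compute_stats) showing how the
# separate .compute() calls for Q1 and Q3 could be batched with dask.compute,
# so the underlying dataframes are evaluated in a single pass. The function
# name is illustrative; the dataframe arguments mirror those above.
import dask

def compute_basic_stats(user_reviews_csv, products_csv):
    # Build the lazy graphs first...
    prod_prop = products_csv.isnull().mean() * 100
    rev_prop = user_reviews_csv.isnull().mean() * 100
    price_stats = products_csv.price.describe(percentiles=[0.5])
    # ...then evaluate them together instead of three separate .compute() calls.
    prod_prop, rev_prop, price_stats = dask.compute(prod_prop, rev_prop, price_stats)
    return prod_prop.round(2), rev_prop.round(2), price_stats.round(2)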
def restart_workers(address=u'tcp://172.31.54.193:8786'): client = Client(address) client.restart() sleep(2) print(u'workers restarted.')
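# A hedged variant of restart_workers: rather than sleeping for a fixed two
# seconds, wait until an expected number of workers has re-registered with the
# scheduler. The worker count and address below are illustrative assumptions.
from dask.distributed import Client


def restart_and_wait(address=u'tcp://172.31.54.193:8786', n_workers=4):
    client = Client(address)
    client.restart()
    # Blocks until n_workers workers are connected again
    client.wait_for_workers(n_workers=n_workers)
    print(u'workers restarted: %d worker(s) online' % len(client.nthreads()))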
class GlideDock: """ Score structures using Glide docking score, including ligand preparation with LigPrep """ return_metrics = [ 'r_i_docking_score', 'r_i_glide_ligand_efficiency', 'r_i_glide_ligand_efficiency_sa', 'r_i_glide_ligand_efficiency_ln', 'r_i_glide_gscore', 'r_i_glide_lipo', 'r_i_glide_hbond', 'r_i_glide_metal', 'r_i_glide_rewards', 'r_i_glide_evdw', 'r_i_glide_ecoul', 'r_i_glide_erotb', 'r_i_glide_esite', 'r_i_glide_emodel', 'r_i_glide_energy', 'r_i_glide_rmsd_to_input' ] def __init__(self, prefix: str, glide_template: os.PathLike, cluster: str = None, timeout: float = 120.0, **kwargs): """ :param prefix: Prefix to identify scoring function instance (e.g., DRD2) :param glide_template: Path to a template docking file (.in) :param cluster: Address to Dask scheduler for parallel processing via dask :param timeout: Timeout (seconds) before killing an individual docking simulation :param kwargs: """ # Read in glide template (.in) with open(glide_template, 'r') as gfile: self.glide_options = gfile.readlines() # Make sure output file type is sdf self.glide_options = self.modify_glide_in(self.glide_options, 'POSE_OUTTYPE', 'ligandlib_sd') # Specify class attributes self.prefix = prefix.replace(" ", "_") self.glide_metrics = GlideDock.return_metrics self.glide_env = os.path.join(os.environ['SCHRODINGER'], 'glide') self.ligprep_env = os.path.join(os.environ['SCHRODINGER'], 'ligprep') self.timeout = float(timeout) self.cluster = cluster if self.cluster is not None: self.client = Client(self.cluster) self.variants = None self.docking_results = None @staticmethod def modify_glide_in(glide_in: list, glide_property: str, glide_value: str): """ Convenience function to insert / overwrite certain .in file properties and values :param glide_in: List of lines from the .in file :param glide_property: Property to be changed (e.g. POSE_OUTTYPE) :param glide_value: Value to change to (e.g. ligandlib_sd) :return: Modified glide_in """ # If property is already present, replace value if any( [True if glide_property in line else False for line in glide_in]): for i in range(len(glide_in)): if glide_property in glide_in[i]: glide_in[i] = f'{glide_property} {glide_value}\n' break # Otherwise insert it before newline (separates properties from features) elif any([True if line == '\n' else False for line in glide_in]): for i in range(len(glide_in)): if glide_in[i] == '\n': glide_in.insert(i, f'{glide_property} {glide_value}\n') break # Otherwise just stick it on the end of the file else: glide_in.append(f'{glide_property} {glide_value}\n') return glide_in def run_ligprep(self, smiles: list): """ Call ligprep to prepare molecules. 
:param smiles: List of SMILES strings """ ligprep_commands = [] # Write out smiles to sdf files and prepare ligprep commands for smi, name in zip(smiles, self.file_names): smi_in = os.path.join(self.directory, f'{name}.smi') sdf_out = os.path.join(self.directory, f'{name}_ligprep.sdf') with open(smi_in, 'w') as f: f.write(smi) command = " ".join( (self.ligprep_env, f"-ismi {smi_in}", f"-osd {sdf_out}", "-ph 7.0", "-pht 1.0", "-bff 16", "-s 8", "-epik", "-WAIT", "-NOJOBID")) ligprep_commands.append(command) # Initialize subprocess logger.debug('LigPrep called') p = timedSubprocess(timeout=self.timeout).run # Run commands either using Dask or sequentially if self.cluster is not None: futures = self.client.map(p, ligprep_commands) _ = self.client.gather(futures) else: _ = [p(command) for command in ligprep_commands] logger.debug('LigPrep finished') return self @property def split_sdf(self): """ Split ligprep output sdf so that each variant can be independently run. """ # Read in ligprep output files and split to individual variants self.variants = {name: [] for name in self.file_names} for name in self.file_names: out_file = os.path.join(self.directory, f'{name}_ligprep.sdf') if os.path.exists(out_file): supp = Chem.rdmolfiles.ForwardSDMolSupplier(os.path.join( self.directory, f'{name}_ligprep.sdf'), sanitize=False, removeHs=False) for mol in supp: if mol: variant = mol.GetPropsAsDict()['s_lp_Variant'] if ':' in variant: variant = variant.split(':')[1] variant = variant.split('-')[1] self.variants[name].append(variant) w = Chem.rdmolfiles.SDWriter( os.path.join(self.directory, f'{name}-{variant}_ligprep.sdf')) w.write(mol) w.flush() w.close() logger.debug(f'Split {name} -> {name}-{variant}') else: continue return self def run_glide(self): """ Write GLIDE new input files and submit each to Glide """ glide_commands = [] for name in self.file_names: for variant in self.variants[name]: # Set some file paths glide_in = self.glide_options.copy() # Change glide_in file glide_in = self.modify_glide_in( glide_in, 'LIGANDFILE', os.path.join(self.directory, f'{name}-{variant}_ligprep.sdf')) glide_in = self.modify_glide_in(glide_in, 'OUTPUTDIR', os.path.join(self.directory)) # Write new input file (.in) with open(os.path.join(self.directory, f'{name}-{variant}.in'), 'wt') as f: [f.write(line) for line in glide_in] # Prepare command line command command = self.glide_env + ' -WAIT -NOJOBID -NOLOCAL ' + \ os.path.join(self.directory, f'{name}-{variant}.in') glide_commands.append(command) # Initialize subprocess logger.debug('Glide called') p = timedSubprocess(timeout=self.timeout).run if self.cluster is not None: futures = self.client.map(p, glide_commands) _ = self.client.gather(futures) else: _ = [p(command) for command in glide_commands] logger.debug('Glide finished') return self def get_docking_scores(self, smiles: list, return_best_variant: bool = False): """ Read output sdfs, get output properties :param smiles: List of SMILES strings :param return_best_variant: :return optional, list of filenames with best variant """ # Read in docked file best_variants = self.file_names.copy() best_score = {name: None for name in self.file_names} # For each molecule for i, (smi, name) in enumerate(zip(smiles, self.file_names)): docking_result = {'smiles': smi} # For each variant for variant in self.variants[name]: out_file = os.path.join(self.directory, f'{name}-{variant}_lib.sdfgz') if os.path.exists(out_file): # Try to load it in, and grab the score try: with gzip.open(out_file) as f: glide_out = 
Chem.ForwardSDMolSupplier(f) for mol in glide_out: # should just be one dscore = mol.GetPropsAsDict( )['r_i_docking_score'] # If molecule doesn't have a score yet append it and the variant if best_score[name] is None: best_score[name] = dscore best_variants[i] = f'{name}-{variant}' docking_result.update({ f'{self.prefix}_' + k: v for k, v in mol.GetPropsAsDict().items() if k in self.glide_metrics }) logger.debug( f'Docking score for {name}-{variant}: {dscore}' ) # If docking score is better change it... elif dscore < best_score[name]: best_score[name] = dscore best_variants[i] = f'{name}-{variant}' docking_result.update({ f'{self.prefix}_' + k: v for k, v in mol.GetPropsAsDict().items() if k in self.glide_metrics }) logger.debug( f'Found better {name}-{variant}: {dscore}' ) # Otherwise ignore else: pass # If parsing the molecule threw an error and nothing stored, append 0 except: logger.debug( f'Error processing {name}-{variant}_lib.sdfgz file' ) if best_score[ name] is None: # Only if no other score for prefix best_variants[i] = f'{name}-{variant}' docking_result.update({ f'{self.prefix}_' + k: 0.0 for k in self.glide_metrics }) logger.debug( f'Returning 0.0 unless a successful variant is found' ) # If path doesn't exist and nothing store, append 0 else: logger.debug(f'{name}-{variant}_lib.sdfgz does not exist') if best_score[ name] is None: # Only if no other score for prefix best_variants[i] = f'{name}-{variant}' docking_result.update({ f'{self.prefix}_' + k: 0.0 for k in self.glide_metrics }) logger.debug( f'Returning 0.0 unless a successful variant is found' ) # Add best variant information to docking result docking_result.update( {f'{self.prefix}_best_variant': best_variants[i]}) self.docking_results.append(docking_result) logger.debug(f'Best scores: {best_score}') if return_best_variant: logger.debug(f'Returning best variants: {best_variants}') return best_variants return self def remove_files(self, keep: list = [], parallel: bool = True): """ Remove some of the log files and molecule files. :param keep: List of filenames to keep pose files for. :param parallel: Whether to run using Dask (requires scheduler address during initialisation). """ # If no cluster is provided ensure parallel is False if (parallel is True) and (self.cluster is None): parallel = False keep_poses = [f'{k}_lib.sdfgz' for k in keep] logger.debug(f'Keeping pose files: {keep_poses}') del_files = [] for name in self.file_names: # Grab files files = glob.glob(os.path.join(self.directory, f'{name}*')) logger.debug(f'Glob found {len(files)} files') if len(files) > 0: try: files = [ file for file in files if not ("log.txt" in file) and not any([p in file for p in keep_poses]) ] if parallel: [del_files.append(file) for file in files] else: [os.remove(file) for file in files] # No need to stop if files can't be found and deleted except FileNotFoundError: logger.debug('File not found.') pass if parallel: futures = self.client.map(os.remove, del_files) _ = self.client.gather(futures) return self def __call__(self, smiles: list, directory: str, file_names: list, **kwargs): """ Calculate scores for GlideDock :param smiles: List of SMILES strings :param directory: Directory to save files and logs into :param file_names: List of corresponding file names for SMILES to match files to index :param kwargs: Ignored :return: List of dicts i.e. [{'smiles': smi, 'metric': 'value', ...}, ...] 
""" # Assign some attributes step = file_names[0].split("_")[0] # Assume first Prefix is step # Create log directory self.directory = os.path.join(os.path.abspath(directory), 'GlideDock', step) if not os.path.exists(self.directory): os.makedirs(self.directory) self.file_names = file_names self.docking_results = [] # make sure no carry over # Add logging file handler fh = logging.FileHandler( os.path.join(self.directory, f'{step}_log.txt')) fh.setLevel(logging.DEBUG) formatter = logging.Formatter( '%(asctime)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) # Refresh Dask every few hundred iterations if self.cluster is not None: if int(step) % 250 == 0: self.client.restart() # Run protocol self.run_ligprep(smiles=smiles) self.split_sdf # Catch any erroneous smiles with no output ligprep file self.run_glide() best_variants = self.get_docking_scores(smiles=smiles, return_best_variant=True) # Cleanup self.remove_files(keep=best_variants, parallel=True) fh.close() logger.removeHandler(fh) self.directory = None self.file_names = None self.variants = None # Check assert len(smiles) == len(self.docking_results) return self.docking_results
class Cluster(object): """ Wrapper for ``dask`` clients Best practices: "By "node" people typically mean a physical or virtual machine. That node can run several programs or processes at once (much like how my computer can run a web browser and text editor at once). Each process can parallelize within itself with many threads. Processes have isolated memory environments, meaning that sharing data within a process is free, while sharing data between processes is expensive. Typically things work best on larger nodes (like 36 cores) if you cut them up into a few processes, each of which have several threads. You want the number of processes times the number of threads to equal the number of cores. So for example you might do something like the following for a 36 core machine: Four processes with nine threads each Twelve processes with three threads each One process with thirty-six threads Typically one decides between these choices based on the workload. The difference here is due to Python's Global Interpreter Lock, which limits parallelism for some kinds of data. If you are working mostly with Numpy, Pandas, Scikit-Learn, or other numerical programming libraries in Python then you don't need to worry about the GIL, and you probably want to prefer few processes with many threads each. This helps because it allows data to move freely between your cores because it all lives in the same process. However, if you're doing mostly Pure Python programming, like dealing with text data, dictionaries/lists/sets, and doing most of your computation in tight Python for loops then you'll want to prefer having many processes with few threads each. This incurs extra communication costs, but lets you bypass the GIL. In short, if you're using mostly numpy/pandas-style data, try to get at least eight threads or so in a process. Otherwise, maybe go for only two threads in a process." --MRocklin (https://stackoverflow.com/questions/51099685/best-practices-in-setting-number-of-dask-workers) Examples: >>> # I/O-heavy task with 8 cores >>> cluster = Cluster(n_workers=4, >>> threads_per_worker=2, >>> scheduler_port=0, >>> processes=False) >>> >>> # Task with little need of the GIL with 16 cores >>> cluster = Cluster(n_workers=2, >>> threads_per_worker=8, >>> scheduler_port=0, >>> processes=False) """ def __init__(self, **kwargs): self.kwargs = kwargs self.cluster = None self.client = None def start(self): self.cluster = LocalCluster(**self.kwargs) self.client = Client(self.cluster) def restart(self): self.client.restart() print(self.client) def stop(self): self.client.close() self.cluster.close() self.client = None self.cluster = None
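# Usage sketch for the Cluster wrapper above, following the docstring's advice
# of matching processes x threads to the available cores (the numbers here are
# illustrative for an 8-core machine):
cluster = Cluster(n_workers=4, threads_per_worker=2,
                  scheduler_port=0, processes=False)
cluster.start()    # create the LocalCluster and attach a Client
cluster.restart()  # clear worker state between runs
cluster.stop()     # close the client and the cluster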
import time import pprint from blazingsql import BlazingContext from dask.distributed import Client client = Client('127.0.0.1:8786') client.restart() bc = BlazingContext(dask_client=client, network_interface="lo") # bc = BlazingContext() dir_data_fs = '/home/aocsa/tpch/100MB2Part/' nfiles = 4 # bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet']) bc.create_table('customer', dir_data_fs + '/customer_*.parquet') # "BindableTableScan(table=[[main, customer]], # filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]], # projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])" # query = """select c_custkey, c_nationkey, c_acctbal # from # customer # where # c_custkey > 2990 and c_custkey < 3010 # """ query = "select sum(c_custkey)/count(c_custkey), min(c_custkey) from customer limit 5" # [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment'] lp = bc.explain(query)
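# Hedged continuation of the script above: after inspecting the logical plan
# with bc.explain, the query itself would typically be executed with bc.sql.
# With a dask_client attached, the result is expected to be a distributed
# dataframe, hence the .compute() call; this is a sketch, not verified against
# a running BlazingSQL cluster.
result = bc.sql(query)
print(result.compute())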
def connect( args: argparse.Namespace ) -> typing.Tuple[dask.distributed.Client, typing.Optional[dask.distributed.LocalCluster]]: """ Connect to the dask cluster specified by the arguments in `args` Specifically, this function uses args.cluster_location to determine whether to start a dask.distributed.LocalCluster (in case args.cluster_location is "LOCAL") or to (attempt to) connect to an existing cluster (any other value). If a local cluster is started, it will use a number of worker processes equal to args.num_procs. Each process will use args.num_threads_per_proc threads. The scheduler for the local cluster will listen to a random port. Parameters ---------- args: argparse.Namespace A namespace containing the following fields: * cluster_location * client_restart * num_procs * num_threads_per_proc Returns ------- client: dask.distributed.Client The client for the dask connection cluster: dask.distributed.LocalCluster or None If a local cluster is started, the reference to the local cluster object is returned. Otherwise, None is returned. """ from dask.distributed import Client as DaskClient from dask.distributed import LocalCluster as DaskCluster client = None cluster = None if args.cluster_location == "LOCAL": msg = "[dask_utils]: starting local dask cluster" logger.info(msg) cluster = DaskCluster(n_workers=args.num_procs, processes=True, threads_per_worker=args.num_threads_per_proc) client = DaskClient(cluster) else: msg = "[dask_utils]: attempting to connect to dask cluster: {}" msg = msg.format(args.cluster_location) logger.info(msg) client = DaskClient(address=args.cluster_location) if args.client_restart: msg = "[dask_utils]: restarting client" logger.info(msg) client.restart() return client, cluster
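# Minimal usage sketch for connect(): the Namespace fields mirror those listed
# in the docstring. "LOCAL" starts a LocalCluster; any other value is treated
# as a scheduler address. The numbers below are illustrative.
import argparse

args = argparse.Namespace(
    cluster_location="LOCAL",   # or e.g. "tcp://127.0.0.1:8786"
    client_restart=False,
    num_procs=4,
    num_threads_per_proc=2,
)
client, cluster = connect(args)
print(client)
client.close()
if cluster is not None:
    cluster.close()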