def test_extract_partitions_futures(nrows, ncols, n_parts, X_delayed,
                                    y_delayed, colocated, cluster):
    """Check that the data handler reports one GPU-future entry per chunk.

    Random CuPy data is wrapped in Dask arrays split into ``n_parts`` row
    chunks, optionally persisted on the cluster (when the corresponding
    ``*_delayed`` flag is false), and passed to
    ``DistributedDataHandler.create`` either as an ``(X, y)`` tuple
    (``colocated``) or as ``X`` alone.  The client is closed even when the
    assertion fails.
    """
    client = Client(cluster)

    try:
        rows_per_chunk = nrows / n_parts

        X_raw = cp.random.standard_normal((nrows, ncols))
        y_raw = cp.random.standard_normal((nrows, ))

        X = da.from_array(X_raw, chunks=(rows_per_chunk, -1))
        y = da.from_array(y_raw, chunks=(rows_per_chunk, ))

        if not X_delayed:
            X = client.persist(X)
        if not y_delayed:
            y = client.persist(y)

        # Colocated mode hands features and labels over together.
        payload = (X, y) if colocated else X
        ddh = DistributedDataHandler.create(payload, client)

        # Each gpu_futures entry is indexable; item 1 is the part itself.
        parts = [entry[1] for entry in ddh.gpu_futures]
        assert len(parts) == n_parts
    finally:
        client.close()
def array_images():
    """Lazily load 20 PNG images, stack them and process them on a local Dask cluster.

    NOTE: this is Python 2 code (``print`` statements).

    Steps, in order:
      1. wrap ``skimage.io.imread`` in ``dask.delayed`` and create one lazy
         read per image index in ``range(1376, 1396)``;
      2. turn each delayed read into a Dask array and stack along axis 0;
      3. rechunk to ``(5, 2000, 2000)`` and print a few eager statistics;
      4. start a ``LocalCluster`` with default settings, persist the stack,
         then map ``otsu_thresholding`` and ``blob_detection`` over the
         blocks and persist again.

    ``sample``, ``otsu_thresholding`` and ``blob_detection`` are not defined
    in this function and must exist at module level.  Nothing is returned
    and the cluster/client are not closed here.
    """
    custom_imread = dask.delayed(skimage.io.imread, pure=True)
    # One delayed read per file; the directory is hard-coded to the
    # original author's machine.
    images = [
        custom_imread(
            '/Users/nivethamahalakshmibalasamy/Documents/ECI-PolarScience/dask_stuff/grayscale-xy-%d.png'
            % i) for i in range(1376, 1396)
    ]
    #print images
    # Shape and dtype are taken from ``sample`` -- presumably one
    # representative image loaded elsewhere in the module; TODO confirm.
    image_array = [
        da.from_delayed(i, sample.shape, sample.dtype) for i in images
    ]
    sizes = [j.shape for j in image_array]  # only used by the debug print below
    #print sizes
    stack = da.stack(image_array, axis=0)
    print stack
    #print stack[0]
    # Combining chunks - A chunk consists of 5 images
    stack = stack.rechunk((5, 2000, 2000))
    print "After rechunking: "
    temp = stack  # only used by the commented-out visualize() below
    #temp.visualize()
    print "Before distributing to workers:"
    # Each compute() below evaluates the graph with whichever scheduler is
    # the default at this point (no client has been created yet).
    print stack.mean().compute()
    print stack[1, :].compute()
    print stack[19, :].mean().compute()
    stack.visualize()
    # Distribute array components over workers and centralized scheduler
    cluster = LocalCluster()
    client = Client(cluster)
    print client
    # Load the entire distributed array on the cluster.
    # NOTE(review): the original comment mentions "4 workers, 4 cores";
    # LocalCluster() is called with defaults here, so the actual numbers
    # depend on the host.
    stack = client.persist(stack)
    #print stack.shape
    #print "After distributing to workers: "
    print stack.mean().compute()
    # map the otsu thresholding function
    #print stack[0]
    stack = da.map_blocks(otsu_thresholding,
                          stack,
                          chunks=(5, 2000, 2000),
                          dtype=sample.dtype)
    # Then run blob detection on the thresholded blocks.
    stack = da.map_blocks(blob_detection,
                          stack,
                          chunks=(5, 2000, 2000),
                          dtype=sample.dtype)
    stack = client.persist(stack)
    #th = client.persist(th)
    #thresholded.visualize()
    #th = client.persist(thresholded)
    #print thresholded.mean().compute()
    #print thresholded
    #print stack.shape
    print stack.mean().compute()
    stack.visualize()
def run_variant_processing_pipeline(input: str,
                                    client: Client = None
                                    ) -> Dict[str, ddf.DataFrame]:
    """
    Process the variants of one GVF file and split the result into variant
    effects and variant metadata.

    arguments
        input:  input variant filepath in GVF format
        client: dask client object; when None, get_client() supplies one

    returns
        a dict containing two keys
            dict:
                effects:  dataframe containing variant effects
                metadata: dataframe containing variant metadata
    """

    if client is None:
        client = get_client()

    ## Parse into the intermediate format and keep it on the cluster, since
    ## both outputs below are derived from the same frame
    intermediate = client.persist(_process_variants(input))

    ## Split the intermediate frame into its two views
    effects = isolate_variant_effects(intermediate)
    metadata = isolate_variant_metadata(intermediate)

    return dict(effects=effects, metadata=metadata)
def main():
    """Time ``calculate_switching`` for every cann group on a running cluster.

    Connects to the scheduler at 127.0.0.1:8786, loads the data frame via
    ``read_df``, indexes it by ``customerKey``, repartitions it to one
    partition per reported core, maps products to cann groups, persists
    the frame and then runs ``calculate_switching`` on each group's subset
    inside a ``Timer``.
    """
    args = get_args()
    client = Client('127.0.0.1:8786')
    total_cores = sum(client.ncores().values())

    display_options = (
        ('display.large_repr', 'truncate'),
        ('display.max_columns', 0),
        ('display.max_rows', 1000),
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)

    cann_group_df = make_cann_group_df(num_products=100)
    frame = read_df(args, cann_group_df['productKey'])

    logger.info('Setting index')
    frame = frame.set_index('customerKey', drop=True)

    logger.info('Repartitioning')
    frame = frame.repartition(npartitions=total_cores)

    logger.info('Mapping Cann Group')
    frame['cannGroupKey'] = frame['productKey'].map(
        cann_group_df['cannGroupKey'])

    logger.info('Persisting')
    frame = client.persist(frame)

    logger.info('Cann Groups')
    group_keys = cann_group_df['cannGroupKey'].unique().tolist()
    for cann_group_key in group_keys:
        print('Filtering Cann Group %s' % cann_group_key)
        subset = frame[frame['cannGroupKey'] == cann_group_key]
        print('This df: %s' % (len(subset), ))
        with Timer('%s' % (cann_group_key, )):
            calculate_switching(subset)
def process_tasks(tasks: Iterable[Task],
                  proc: TaskProc,
                  client: Client,
                  sink: S3COGSink,
                  check_exists: bool = True,
                  chunked_persist: int = 0,
                  verbose: bool = True) -> Iterator[str]:
    """Run ``proc`` over ``tasks`` and write each result through ``sink``.

    For every task the output location is ``sink.uri(task)``.  When
    ``check_exists`` is set and ``sink.exists(task)`` is true, the task is
    not processed and its path is yielded immediately.  Otherwise the
    result of ``proc(task)`` is persisted on the cluster (in chunks via
    ``chunked_persist_da`` when ``chunked_persist > 0``), the write is
    submitted with ``client.compute(sink.dump(...))`` and tracked in a set
    of in-flight futures.  Whatever ``drain`` reports as done is yielded
    as the loop progresses, and the remainder after the last task.

    A ``None`` result from ``proc`` is treated the same way as an
    already-existing output.
    """

    def _prepare(
        pending: Iterable[Task], load: TaskProc
    ) -> Iterator[Tuple[Union[xr.Dataset, xr.DataArray, None], Task, str]]:
        # Pair every task with its destination; None marks "do not process".
        for item in pending:
            uri = sink.uri(item)
            already_there = check_exists and sink.exists(item)
            yield (None if already_there else load(item), item, uri)

    writes: Set[Future] = set()

    for data, task, path in _with_lookahead1(_prepare(tasks, proc)):
        if data is None:
            if verbose:
                print(f"..skipping: {path} (exists already)")
            yield path
            continue

        if chunked_persist > 0:
            assert isinstance(data, xr.DataArray)
            data = chunked_persist_da(data, chunked_persist, client)
        else:
            data = client.persist(data, fifo_timeout='1ms')

        # Collect whatever earlier writes have finished in the meantime.
        if writes:
            finished, writes = drain(writes, 1.0)
            for result in finished:
                yield result

        if isinstance(data, xr.DataArray):
            # to_dataset() does not carry the array attrs onto the
            # per-band variables, so copy them across explicitly.
            saved_attrs = data.attrs.copy()
            data = data.to_dataset(dim='band')
            for band in data.data_vars.values():
                band.attrs.update(saved_attrs)

        write = client.compute(sink.dump(task, data), fifo_timeout='1ms')

        # Block until the persisted inputs are ready, then drop our
        # references to them.
        status = dask_wait(data)
        assert len(status.not_done) == 0
        del data, status

        writes.add(write)

    finished, _ = drain(writes)
    for result in finished:
        yield result
def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load parquet dataset into dask cluster

    If no cluster is found loads a new one and persist the data to it. It
    should not be necessary to create a new cluster when the function
    is run as a 'dask' job.

    :param context:         the function context
    :param parquet_url:     url of the parquet file or partitioned dataset as
                            either artifact DataItem, string, or path object
                            (passed to dask's read_parquet)
    :param inc_cols:        include only these columns
                            (NOTE: currently not applied by this function)
    :param index_cols:      list of index column names
                            (NOTE: currently not applied by this function)
    :param shards:          number of workers to launch
    :param threads_per:     number of threads per worker
    :param processes:       passed to LocalCluster(processes=...) when a new
                            cluster is started
    :param memory_limit:    per-worker memory limit for a new cluster
    :param persist:         persist the dataframe on the cluster, publish it
                            and write/log the scheduler file
    :param dask_key:        name under which the persisted dataframe is
                            published on the cluster
    :param target_path:     directory in which 'scheduler.json' is written
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    df = dd.read_parquet(parquet_url)

    if persist and context:
        df = dask_client.persist(df)
        # publish_dataset uses the keyword NAME as the dataset name, so the
        # value of ``dask_key`` has to be expanded into the keyword itself;
        # ``publish_dataset(dask_key=df)`` would always publish "dask_key".
        dask_client.publish_dataset(**{dask_key: df})
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

        print(df.head())
# - Load data from files # - Filter data to a particular subset # - Shuffle data to set an intelligent index # - Several complex queries on top of this indexed data # # It is often ideal to load, filter, and shuffle data once and keep this result # in memory. Afterwards, each of the several complex queries can be based off of # this in-memory data rather than have to repeat the full load-filter-shuffle # process each time. To do this, use the client.persist method # We import data from S3 (or from pandas) in Xseconds. It will be a great choice # to use client.persist method in order to avoid this execution time each time # we want to process our data. user_item = df_interactions[['msno', 'song_id', 'interacted']] user_item = client.persist(user_item, retries=2) user_item_grouped = user_item.groupby(['msno', 'song_id'])['interacted'].max() user_item_grouped_pd = client.compute(user_item_grouped).result() type(user_item_grouped_pd) dtype_items = {'song_length': np.int32, 'language': np.float32} df_items = pd.read_csv('items2.csv', dtype=dtype_items) df_inter['date'] = 0 #MAKE RECOMMENDATIONS rec = robo.Recommender(df_items=df_items, df_reviews=df_inter, user_item_grouped=user_item_grouped_pd, item_name_colname='name',
def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
    """Solve for the sky with L2-regularised least squares and store it on ``sphere``.

    Two solver paths exist:

    * ``usedask=False`` (default): the operator comes from
      ``self.make_gamma(sphere)`` and the right-hand side from
      ``vis_to_real(vis_arr)``; a scikit-learn ``Ridge`` model
      (``solver="lsqr"``) is fitted with regularisation ``alpha``.
    * ``usedask=True``: an in-process ``LocalCluster`` is started, one
      complex harmonic row is built per (u, v, w) baseline, the rows are
      stacked, conjugated and split into real and imaginary blocks, and a
      dask-ml ``LinearRegression`` with an L2 penalty is fitted with
      ``C = 1 / (alpha / sqrt(n_pixels))``.

    Parameters
    ----------
    vis_arr :
        Complex visibilities (real and imaginary parts are concatenated).
    sphere :
        Sky discretisation.  Reads ``pixels``, ``l``, ``m``, ``n`` and
        ``npix``; the solution is written back through
        ``sphere.set_visible_pixels(sky, scale=False)``.
    alpha : float
        Regularisation strength.
    scale : bool
        Currently unused; pixels are always stored with ``scale=False``.
    usedask : bool
        Select the Dask-based solver.

    Returns
    -------
    The solved sky reshaped to a single column, ``sky.reshape(-1, 1)``.

    Notes
    -----
    The Dask cluster/client created in the ``usedask`` path is not closed
    by this method.
    """
    n_s = sphere.pixels.shape[0]
    n_v = self.u_arr.shape[0]

    lambduh = alpha / np.sqrt(n_s)
    if not usedask:
        gamma = self.make_gamma(sphere)
        logger.info("augmented: {}".format(gamma.shape))

        vis_aux = vis_to_real(vis_arr)
        logger.info("vis mean: {} shape: {}".format(
            np.mean(vis_aux), vis_aux.shape))

        tol = min(alpha / 1e4, 1e-10)
        logger.info("Solving tol={} ...".format(tol))

        # reg = linear_model.ElasticNet(alpha=alpha/np.sqrt(n_s),
        #                               tol=1e-6,
        #                               l1_ratio = 0.01,
        #                               max_iter=100000,
        #                               positive=True)
        if False:
            # Disabled alternative: damped LSQR straight from scipy.
            (
                sky,
                lstop,
                itn,
                r1norm,
                r2norm,
                anorm,
                acond,
                arnorm,
                xnorm,
                var,
            ) = scipy.sparse.linalg.lsqr(gamma, vis_aux, damp=alpha, show=True)
            logger.info(
                "Alpha: {}: Iterations: {}: rnorm: {}: xnorm: {}".format(
                    alpha, itn, r2norm, xnorm))
        else:
            reg = linear_model.Ridge(alpha=alpha,
                                     tol=tol,
                                     solver="lsqr",
                                     max_iter=100000)
            reg.fit(gamma, vis_aux)
            logger.info(" Solve Complete, iter={}".format(reg.n_iter_))

            sky = da.from_array(reg.coef_)

            residual = vis_aux - gamma @ sky

            # Evaluate the solution and both diagnostic norms in one pass.
            sky, residual_norm, solution_norm = da.compute(
                sky,
                np.linalg.norm(residual)**2,
                np.linalg.norm(sky)**2)

            score = reg.score(gamma, vis_aux)
            logger.info("Alpha: {}: Loss: {}: rnorm: {}: snorm: {}".format(
                alpha, score, residual_norm, solution_norm))
    else:
        from dask_ml.linear_model import LinearRegression
        import dask_glm
        from dask.distributed import Client, LocalCluster
        from dask.diagnostics import ProgressBar
        import dask

        logger.info("Starting Dask Client")

        if True:
            cluster = LocalCluster(dashboard_address=":8231", processes=False)
            client = Client(cluster)
        else:
            client = Client("tcp://localhost:8786")

        logger.info("Client = {}".format(client))

        harmonic_list = []
        p2j = 2 * np.pi * 1.0j

        dl = sphere.l
        dm = sphere.m
        dn = sphere.n

        n_arr_minus_1 = dn - 1

        du = self.u_arr
        dv = self.v_arr
        dw = self.w_arr

        for u, v, w in zip(du, dv, dw):
            harmonic = da.from_array(
                np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1)) /
                np.sqrt(sphere.npix),
                chunks=(n_s, ),
            )
            # Keep the persisted handle: the original bound it to a
            # misspelled name ("harminc"), so the persisted row was dropped
            # and the unpersisted array was stacked instead.
            harmonic = client.persist(harmonic)
            harmonic_list.append(harmonic)

        gamma = da.stack(harmonic_list)
        logger.info("Gamma Shape: {}".format(gamma.shape))
        # gamma = gamma.reshape((n_v, n_s))
        gamma = gamma.conj()
        gamma = client.persist(gamma)

        logger.info("Gamma Shape: {}".format(gamma.shape))

        logger.info("Building Augmented Operator...")
        # Stack real over imaginary parts so the problem becomes real-valued.
        proj_operator_real = da.real(gamma)
        proj_operator_imag = da.imag(gamma)
        proj_operator = da.block([[proj_operator_real], [proj_operator_imag]])

        proj_operator = client.persist(proj_operator)

        logger.info("Proj Operator shape {}".format(proj_operator.shape))
        vis_aux = da.from_array(
            np.array(
                np.concatenate((np.real(vis_arr), np.imag(vis_arr))),
                dtype=np.float32,
            ))

        # The ElasticNet regulariser is immediately replaced by plain L2;
        # only the L2 object is passed to the solver below.
        en = dask_glm.regularizers.ElasticNet(weight=0.01)
        en = dask_glm.regularizers.L2()

        dask.config.set({"array.chunk-size": "1024MiB"})
        A = da.rechunk(proj_operator, chunks=("auto", n_s))
        A = client.persist(A)

        y = vis_aux  # da.rechunk(vis_aux, chunks=('auto', n_s))
        y = client.persist(y)
        # sky = dask_glm.algorithms.proximal_grad(A, y, regularizer=en, lambduh=alpha, max_iter=10000)

        logger.info("Rechunking completed.. A= {}.".format(A.shape))
        reg = LinearRegression(
            penalty=en,
            C=1.0 / lambduh,
            fit_intercept=False,
            solver="lbfgs",
            max_iter=1000,
            tol=1e-8,
        )
        sky = reg.fit(A, y)
        sky = reg.coef_
        score = reg.score(proj_operator, vis_aux)
        logger.info("Loss function: {}".format(score.compute()))

    logger.info("Solving Complete: sky = {}".format(sky.shape))

    sphere.set_visible_pixels(sky, scale=False)
    return sky.reshape(-1, 1)
def run_complete_hg38_variant_processing_pipeline(
        indir: str = None,
        effect_dir: str = None,
        meta_dir: str = None,
        client: Client = None) -> List[Future]:
    """
    Run the variant processing pipeline step for all hg38 build chromosomes.
    Basically a wrapper for run_variant_processing_pipeline but also saves the
    results to disk.

    arguments
        indir:      directory filepath containing hg38 variant (*.gvf) files
        effect_dir: output directory to save variant effects
        meta_dir:   output directory to save variant metadata
        client:     dask client object; when None, get_client() supplies one

    returns
        a list with one dict per input file. Each dict holds the values
        returned by dfio.save_dataframe_async for that file:
            dict:
                effects:  future for the saved variant effects
                metadata: future for the saved variant metadata
    """

    ## This shouldn't be necessary since this function should never be called when
    ## a mouse genome build is specified but w/e
    hg38_globals = Globals().reinitialize(build='hg38')

    if indir is None:
        indir = hg38_globals.dir_variant_raw

    if effect_dir is None:
        effect_dir = hg38_globals.dir_variant_effects

    if meta_dir is None:
        meta_dir = hg38_globals.dir_variant_meta

    client = get_client() if client is None else client

    futures = []

    ## The input directory should have a bunch of GVF files if no custom output filenames
    ## were used
    for fp in glob(f'{indir}/*.gvf'):

        log._logger.info(f'Starting work on {fp}')

        ## Forward the client so the per-file pipeline persists on the same
        ## cluster that is used below, instead of whatever get_client() returns
        results = run_variant_processing_pipeline(fp, client=client)

        ## Persist the processed dataframes onto the cluster workers
        effects = client.persist(results['effects'])
        metadata = client.persist(results['metadata'])

        ## Both outputs share the input's stem with a .tsv suffix
        out_name = Path(Path(fp).stem).with_suffix('.tsv')

        ## Save the processed dataframes in the background
        effect_future = dfio.save_dataframe_async(
            effects, Path(effect_dir, out_name).as_posix())
        meta_future = dfio.save_dataframe_async(
            metadata, Path(meta_dir, out_name).as_posix())

        futures.append({'effects': effect_future, 'metadata': meta_future})

    return futures
def main(
    endpoint: str = endpoint,
    port: int = 443,
    protocol: str = "https",
    collection_name: str = collection_name,
    in_path: Path = in_path,
    api_key: str = os.getenv("TYPESENSE_API_KEY"),
    drop: bool = False,
):
    """Load every CSV under ``in_path`` into a Typesense collection.

    Defaults for ``endpoint``, ``collection_name`` and ``in_path`` come from
    module-level names of the same spelling; ``api_key`` defaults to the
    ``TYPESENSE_API_KEY`` environment variable as read at import time and
    is prompted for when empty.

    With ``drop`` set, the user is asked for confirmation, the existing
    collection is deleted (errors ignored) and recreated from
    ``REVENUE_SCHEMA``.  Files are then processed with ``process_file`` /
    ``import_entries`` and the collected responses are persisted and
    gathered through a Dask client.

    Raises ``typer.Abort`` when the drop is declined or when Typesense
    reports ``RequestUnauthorized``.
    """
    tprint("Reguleque")
    tsClient = ts.Client({
        "api_key":
        api_key
        or typer.prompt(LOG_PROMPT + " Typesense Admin API Key", type=str),
        "nodes": [{
            # The TYPESENSE_HOST environment variable takes precedence over
            # the ``endpoint`` argument.
            "host": os.getenv("TYPESENSE_HOST") or endpoint,
            "port": port,
            "protocol": protocol,
        }],
    })
    # NOTE(review): the message prints ``endpoint`` even when
    # TYPESENSE_HOST overrode the host above.
    typer.echo(LOG_INFO +
               f" Connected to Typesense at {protocol}://{endpoint}:{port}")
    daskClient = Client()
    typer.echo(
        LOG_INFO +
        f" Started cluster, you can monitor at {daskClient.dashboard_link}")
    # List all the files that need loading
    filepaths = list(in_path.rglob("**/*.csv"))
    typer.secho(LOG_INFO + f" Found {len(filepaths)} files to load.")
    try:
        # Drop pre-existing collection if any
        if drop:
            confirm_drop = typer.confirm(
                LOG_WARN +
                " Are you sure you want to delete all documents in the cluster and recreate the schema?"
            )
            if not confirm_drop:
                typer.echo(LOG_ERR + " Canceling execution.", err=True)
                raise typer.Abort()
            typer.echo(
                LOG_WARN +
                " Drop mode has been enabled, dropping all documents and recreating schema...",
                err=True,
            )
            # Best effort: any failure to delete is ignored (presumably to
            # cover a collection that does not exist yet).
            try:
                tsClient.collections[collection_name].delete()
            except Exception:
                pass
            # Create collection with the manual schema
            tsClient.collections.create(REVENUE_SCHEMA)
            typer.secho(LOG_INFO + " Created new schema.")
        # Load all files
        typer.secho(LOG_INFO + " Processing and uploading documents...")
        responses: List[List[str]] = []
        for filepath in filepaths:
            entries: List[dict] = process_file(filepath)
            response: List[str] = import_entries(entries, filepath, tsClient)
            responses.append(response)
        # presumably process_file/import_entries return lazy Dask objects,
        # which is why the list is persisted and gathered -- TODO confirm.
        responses = daskClient.persist(responses)
        progress(responses)
        responses = daskClient.gather(responses)
        sleep(2)
        typer.secho(
            "\n" + LOG_INFO +
            f" Finished processing and uploading {len(filepaths)} documents.")
    except ts.exceptions.RequestUnauthorized:
        typer.echo(LOG_ERR + " Invalid API key or insufficient permissions.",
                   err=True)
        raise typer.Abort()
# Benchmark script: average measurement-set data with a Dask bag and time
# the whole run.  ``t``, ``db``, ``c`` (the Dask client), ``wait`` and the
# load/average/combine helpers are defined outside this fragment.
start1 = t.time()
print('cpu_count: %d' % cpu_count())
path_fmt = '/mnt/storage-ssd/luokaida/data/rawdata/3C129_pband_target.ms'
# Only one input file for now; raise the range() argument to repeat the path.
all_path = [path_fmt for i in range(1)]
print('the number of ms_files: %d\n' % len(all_path))
files = db.from_sequence(all_path)
# Per file: load, average over time, then average over channels.
avg_ms = files.map(load_msdata).map(time_average,
                                    avg_time=2).map(channel_average,
                                                    avg_channel=4)
# Flatten the per-file results and fold items sharing the same ``tag``
# together with combine_msdata.
combine_ms = avg_ms.flatten().foldby(key=lambda x: x.tag,
                                     binop=combine_msdata)
# groups = avg_ms.flatten().groupby(grouper=lambda x: x.tag)
# # example_source = avg_ms.flatten().filter(lambda x: x.tag == '0_2').fold(combine_msdata)
print('Start computation in the background....... '
      )  # Everything above only constructs the task graph.
all_avg_ms = c.persist(combine_ms)  # Persist dask collections on cluster
res = c.gather(all_avg_ms)
wait(res)  # persist() ---> gather(), then block until the work has finished
# res = avg_ms.compute()  # Don't use .compute()
end1 = t.time()
# combine_ms.visualize(filename='flow_chart_20.pdf')
print('\nUsing dask-xarray:map-reduce costs %.2f seconds\n' % (end1 - start1))
print()  # put a breakpoint here
'3166-1-alpha-3', 'DateTime' ] covid19_merge1 = covid19_merge1.drop(remove_cols, axis=1) covid19_merge1 = covid19_merge1.fillna(0) #prepared_data = covid19_merge1.copy() #### 5e changement : exécuté avec des "workers" qui acceptent 8Go max, il y aura une consommation excessive de la mémoire #### On avait 146 partitions, on diminue de moitié la taille des partitions actuelles print("Partitions avant: ", covid19_merge1.npartitions) covid19_merge1 = covid19_merge1.repartition( npartitions=covid19_merge1.npartitions * 2) print("Partitions après: ", covid19_merge1.npartitions) #print("covid19_merge1 partition :",covid19_merge1.npartitions) prepared_data = client.persist(covid19_merge1) ## Encode Pays from dask_ml.preprocessing import LabelEncoder encoded_countries = LabelEncoder().fit_transform(prepared_data.country_name) prepared_data['country_name'] = encoded_countries ## Encode Date dates = prepared_data.date.apply(lambda x: x.strftime('%Y%m%d')) encoded_dates = LabelEncoder().fit_transform(dates) prepared_data['date'] = encoded_dates print(f"Le dataset contient {len(prepared_data)} enregistrements") print("Sample dataset final:") print(prepared_data.head(5))
def run_complete_hg38_annotation_pipeline(variant_dir: str = None,
                                          gene_fp: str = None,
                                          intergenic_dir: str = None,
                                          intragenic_dir: str = None,
                                          client: Client = None):
    """
    Annotate every processed hg38 variant file against the processed gene
    table and save the intergenic and intragenic results to files.

    Any path argument left as None is filled from the hg38 Globals; a
    missing client is obtained from get_client().

    :param variant_dir:    directory holding processed variant effect TSVs
    :param gene_fp:        filepath of the processed gene metadata
    :param intergenic_dir: output directory for intergenic annotations
    :param intragenic_dir: output directory for intragenic annotations
    :param client:         dask client object
    :return: list with one dict per variant file; the 'intergenic' and
             'intragenic' entries hold what dfio.save_dataframe_async returned
    """

    ## Fill in any missing location from the hg38 defaults
    if variant_dir is None:
        variant_dir = Globals().reinitialize(build='hg38').dir_variant_effects

    if gene_fp is None:
        gene_fp = Globals().reinitialize(build='hg38').fp_gene_meta

    if intergenic_dir is None:
        intergenic_dir = Globals().reinitialize(
            build='hg38').dir_annotated_inter

    if intragenic_dir is None:
        intragenic_dir = Globals().reinitialize(
            build='hg38').dir_annotated_intra

    if client is None:
        client = get_client()

    ## One entry per variant file, collected in processing order
    saved = []

    ## The gene table is shared by every variant file
    genes = _read_processed_genes(gene_fp)

    log._logger.info(f'Reading files in {variant_dir}')

    ## The input directory should have variant effect TSV files if no custom output
    ## filenames were used and the pipeline has used default settings to this point
    for fp in glob(f'{variant_dir}/*.tsv'):

        log._logger.info(f'Working on {fp}')

        variants = _read_processed_variants(fp)
        annotated = run_annotation_pipeline(variants, genes)

        ## Pull out both result frames before anything is persisted
        inter_frame = annotated['intergenic']
        intra_frame = annotated['intragenic']

        inter_frame = client.persist(inter_frame)
        intra_frame = client.persist(intra_frame)

        ## Outputs keep the input's file name inside their own directory
        inter_saved = dfio.save_dataframe_async(
            inter_frame,
            Path(intergenic_dir, Path(fp).name).as_posix())
        intra_saved = dfio.save_dataframe_async(
            intra_frame,
            Path(intragenic_dir, Path(fp).name).as_posix())

        saved.append(dict(intergenic=inter_saved, intragenic=intra_saved))

    return saved
for r in rows: for w in world: procs = int(math.ceil(w / TOTAL_NODES)) print("procs per worker", procs, flush=True) assert procs <= 16 stop_dask() start_dask(procs, min(w, TOTAL_NODES)) client = Client("v-001:8786") df_l = dd.read_csv(f"~/temp/twx/{scale}/{r}/{w}/csv1_*.csv").repartition(npartitions=w) df_r = dd.read_csv(f"~/temp/twx/{scale}/{r}/{w}/csv2_*.csv").repartition(npartitions=w) client.persist([df_l, df_r]) print("left rows", len(df_l), flush=True) print("right rows", len(df_r), flush=True) try: for i in range(it): t1 = time.time() out = df_l.merge(df_r, on='0', how='inner', suffixes=('_left','_right')).compute() t2 = time.time() print(f"###time {r} {w} {i} {(t2 - t1)*1000:.0f}, {len(out)}", flush=True) client.restart() client.close()
# Open the HDF5 file read-only; ``dset`` is a handle to the '/x' dataset.
f = h5py.File(os.path.join('data', 'random.hdf5'), mode='r')
dset = f['/x']

import dask.array as da
# Wrap the on-disk dataset lazily in chunks of one million elements.
x = da.from_array(dset, chunks=(1000000, ))

# Timed twice on purpose: without persisting, both runs do the full work.
get_ipython().magic('time x.sum().compute()')
get_ipython().magic('time x.sum().compute()')

# If, instead, we persist the data to RAM up front (this takes a few seconds
# to complete - we could `wait()` on this process), then further computations
# will be much faster.

# In[12]:

# changes x from a set of delayed prescriptions
# to a set of futures pointing to data in RAM
# See this on the UI dashboard.
x = c.persist(x)

# In[13]:

get_ipython().magic('time x.sum().compute()')
get_ipython().magic('time x.sum().compute()')

# Naturally, persisting every intermediate along the way is a bad idea,
# because this will tend to fill up all available RAM and make the whole
# system slow (or break!). The ideal persist point is often at the end of a
# set of data cleaning steps, when the data is in a form which will get
# queried often.

# **Exercise**: how is the memory associated with `x` released, once we know
# we are done with it?

# The worker assumes that when `.compute()` is called, we no longer need the
# data. Thus, sending a message to RAM to clear that block in memory.
# NOTE(review): the answer above looks questionable - persisted data is
# normally held for as long as the client keeps a reference to it, so
# `del x` would be the usual way to release it. Confirm against the Dask
# distributed documentation.

# ## Debugging

# When something goes wrong in a distributed job, it is hard to figure out
# what the problem was and what to do about it. When a task raises an
# exception, the exception will show up when that result, or another result
# that depends upon it, is gathered.
class LargeELMRegressor(_BaseELM, RegressorMixin):
    """ELM Regressor for larger-than-memory problems.

    Uses `Dask <https://dask.org>`_ for batch analysis of data in Parquet files.

    .. attention:: Why do I need Parquet files?

        Parquet files provide necessary information about the data without loading whole file content from
        disk. It makes a tremendous runtime difference compared to simpler `.csv` or `.json` file formats.
        Reading from files saves memory by loading data in small chunks, supporting arbitrary large input files.
        It also solves current memory leaks with Numpy matrix inputs in Dask.

        Any data format can be easily converted to Parquet, see `Analytical methods <techniques.html>`_ section.

        HDF5 is almost as good as Parquet, but performs worse with Dask due to internal data layout.

    .. todo: Write converters.

    .. todo: Memo about number of workers: one is good, several cover disk read latency but need more memory.
        On one machine matrix operators always run in parallel, do not benefit from Dask.

    .. todo: Memory consumption with large number of neurons - 100,000 neurons require 200GB or swap space, with
        read+write reaching 1GB/s. Suggested a fast SSD, or HDD + extra workers to hide swap latency.
        Mention that Dask is not the perfect solution, kept here for future updates. And it actually solves
        stuff larger than memory, albeit at a very high time+swap cost.

    .. todo: Avoid large batch sizes as workers can fail, safe bet is 2000-5000 range.

    .. todo: Fast HtH and in-place Cholesky solver.

    .. todo: Pro tip in documentation: run ELM with dummy 1000 data samples and 1e+9 regularization,
        This will test possible memory issues for workers without wasting your time on computing full HH.

    .. todo: Option to keep full HH permanently somewhere at disk. Saves before the final step,
        avoids failures from memory issues during Cholesky solver.

    .. todo: GPU + batch Cholesky solver, for both ELM and LargeELM.

    Requirements
    ------------
        * Pandas
        * pyarrow
        * python-snappy

    Parameters
    ----------
    batch_size : int
        Batch size used for both data samples and hidden neurons. With batch Cholesky solver, allows for very large
        numbers of hidden neurons of over 100,000; limited only by the computation time and disk swap space.

        .. hint:: Include bias and original features for best performance.

        ELM will include a bias term (1 extra feature), and the original features with `include_original_features=True`.
        For optimal performance, choose `batch_size` to be equal or evenly divide the
        `n_neurons + 1 (bias) + n_inputs (if include_original_features=True)`.

        .. todo:: Exact batch_size vs. GPU performance
    """

    def __del__(self):
        # The client/cluster only exist after the first `fit`.
        if hasattr(self, 'client_'):
            self.client_.close()
            self.cluster_.close()

    def _setup_dask_client(self):
        """Start a local Dask cluster and persist hidden layer weights on it."""
        # NOTE(review): worker count, memory limit and the scratch directory
        # (a path on the original author's machine) are hard-coded here.
        self.cluster_ = LocalCluster(
            n_workers=4,
            threads_per_worker=1,
            local_dir="/Users/akusok/wrkdir/dask-temp",
            memory_limit="8GB")
        self.client_ = Client(self.cluster_)

        W_list = [hl.projection_.components_ for hl in self.hidden_layers_]
        W_dask = [da.from_array(_dense(W), chunks=self.bsize_) for W in W_list]
        self.W_ = self.client_.persist(W_dask)

        def foo():
            # Runs on every worker: limit OpenMP to one thread per worker.
            import os
            os.environ['OMP_NUM_THREADS'] = '1'

        self.client_.run(foo)

        print("Running on:", self.client_)

        # Printing the dashboard address is best effort only; `Exception`
        # (rather than a bare `except`) lets Ctrl-C and SystemExit through.
        try:
            dashboard = self.client_.scheduler_info()['address'].split(":")
            dashboard[0] = "http"
            dashboard[-1] = str(
                self.client_.scheduler_info()['services']['dashboard'])
            print("Dashboard at", ":".join(dashboard))
        except Exception:
            pass

    def _project(self, X_dask):
        """Compute hidden layer output with Dask functionality.

        Columns of the result are ordered as: outputs of every hidden
        layer, then the original features (if enabled), then a bias
        column of ones.
        """
        H_list = []
        for hl, W in zip(self.hidden_layers_, self.W_):
            if hl.hidden_layer_ == HiddenLayerType.PAIRWISE:
                H0 = X_dask.map_blocks(pairwise_distances,
                                       W,
                                       dtype=X_dask.dtype,
                                       chunks=(X_dask.chunks[0],
                                               (W.shape[0], )),
                                       metric=hl.pairwise_metric)
            else:
                XW_dask = da.dot(X_dask, W.transpose())
                if hl.ufunc_ is dummy:
                    H0 = XW_dask
                elif hl.ufunc_ is np.tanh:
                    H0 = da.tanh(XW_dask)
                else:
                    H0 = XW_dask.map_blocks(hl.ufunc_)
            H_list.append(H0)

        if self.include_original_features:
            H_list.append(X_dask)
        H_list.append(da.ones((X_dask.shape[0], 1)))

        H_dask = da.concatenate(H_list, axis=1).rechunk(self.bsize_)
        return H_dask

    def _compute(self, X, y, sync_every, HH=None, HY=None):
        """Computing matrices HH and HY, the actually long part.

        .. todo: actually distributed computations that scatter batches of data file names,
            and reduce-sum the HH,HY matrices.
        """
        # processing files
        for i, (X_file, y_file) in enumerate(zip(X, y)):
            X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
            Y_dask = dd.read_parquet(y_file).to_dask_array(lengths=True)
            H_dask = self._project(X_dask)

            if HH is None:  # first iteration
                HH = da.dot(H_dask.transpose(), H_dask)
                HY = da.dot(H_dask.transpose(), Y_dask)
            else:
                HH += da.dot(H_dask.transpose(), H_dask)
                HY += da.dot(H_dask.transpose(), Y_dask)
                if sync_every is not None and i % sync_every == 0:
                    wait([HH, HY])

            # synchronization
            if sync_every is not None and i % sync_every == 0:
                HH, HY = self.client_.persist([HH, HY])

        # finishing solution
        if sync_every is not None:
            wait([HH, HY])

        return HH, HY

    def _solve(self, HH, HY):
        """Compute output weights from HH and HY using Dask functionality.
        """
        # make HH/HY divisible by chunk size
        n_features, _ = HH.shape
        padding = 0
        if n_features > self.bsize_ and n_features % self.bsize_ > 0:
            print("Adjusting batch size {} to n_features {}".format(
                self.bsize_, n_features))
            padding = self.bsize_ - (n_features % self.bsize_)
            P01 = da.zeros((n_features, padding))
            P10 = da.zeros((padding, n_features))
            P11 = da.zeros((padding, padding))
            HH = da.block([[HH, P01], [P10, P11]])

            P1 = da.zeros((padding, HY.shape[1]))
            HY = da.block([[HY], [P1]])

        # rechunk, add bias, and solve
        HH = HH.rechunk(
            self.bsize_) + self.alpha * da.eye(HH.shape[1], chunks=self.bsize_)
        HY = HY.rechunk(self.bsize_)

        B = da.linalg.solve(HH, HY, sym_pos=True)
        if padding > 0:
            # Drop the rows that belong to the zero padding.
            B = B[:n_features]

        return B

    def fit(self, X, y=None, sync_every=10):
        """Fits an ELM with data in a bunch of files.

        Model will use the set of features from the first file.
        Same features must have same names across the whole dataset.

        .. todo: Check what happens if features are in different order or missing.

        Does **not** support sparse data.

        .. todo: Check if some sparse data would work.

        .. todo: Check that sync_every does not affect results

        .. todo: Add single precision

        .. todo: Parquet file format examples in documentation

        Original features and bias are added to the end of data, for easier rechunk-merge. This way full chunks
        of hidden neuron outputs stay intact.

        Parameters
        ----------

        X : [str]
            List of input data files in Parquet format.

        y : [str]
            List of target data files in Parquet format.

        sync_every : int or None
            Synchronize computations after this many files are processed. None for running without synchronization.
            Less synchronization improves run speed with smaller data files, but may result in large swap space usage
            for large data problems. Use smaller number for more frequent synchronization if swap space
            becomes a problem.
        """

        if not _is_list_of_strings(X) or not _is_list_of_strings(y):
            raise ValueError("Expected X and y as lists of file names.")

        if len(X) != len(y):
            raise ValueError(
                "Expected X and y as lists of files with the same length. "
                "Got len(X)={} and len(y)={}".format(len(X), len(y)))

        # read first file and get parameters
        X_dask = dd.read_parquet(X[0]).to_dask_array(lengths=True)
        Y_dask = dd.read_parquet(y[0]).to_dask_array(lengths=True)

        n_samples, n_features = X_dask.shape
        if hasattr(self, 'n_features_') and self.n_features_ != n_features:
            raise ValueError(
                'Shape of input is different from what was seen in `fit`')

        _, n_outputs = Y_dask.shape
        if hasattr(self, 'n_outputs_') and self.n_outputs_ != n_outputs:
            raise ValueError(
                'Shape of outputs is different from what was seen in `fit`')

        # set batch size, default is bsize=2000 or all-at-once with less than 10_000 samples
        self.bsize_ = self.batch_size
        if self.bsize_ is None:
            self.bsize_ = n_samples if n_samples < 10 * 1000 else 2000

        # init model if not fit yet
        if not hasattr(self, 'hidden_layers_'):
            self.n_features_ = n_features
            self.n_outputs_ = n_outputs

            X_sample = X_dask[:10].compute()
            self._init_hidden_layers(X_sample)
            self._setup_dask_client()

        HH, HY = self._compute(X, y, sync_every=sync_every)
        self.B = self._solve(HH, HY)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Prediction works with both lists of Parquet files and numeric arrays.

        Parameters
        ----------

        X : array-like, [str]
            Input data as list of Parquet files, or as a numeric array.

        Returns
        -------
        Yh : array, shape (n_samples, n_outputs)
            Predicted values for all input samples.

            .. attention:: Returns all outputs as a single in-memory array!

                Danger of running out out memory for high-dimensional outputs, if a large set of input
                files is provided. Feed data in smaller batches in such case.
        """
        check_is_fitted(self, 'is_fitted_')

        if _is_list_of_strings(X):
            Yh_list = []

            # processing files
            for X_file in X:
                X_dask = dd.read_parquet(X_file).to_dask_array(lengths=True)
                H_dask = self._project(X_dask)
                Yh_list.append(da.dot(H_dask, self.B))

            Yh_dask = da.concatenate(Yh_list, axis=0)
            return Yh_dask.compute()

        else:
            X = check_array(X, accept_sparse=True)

            # Column order must match `_project`, which produced the H used
            # to solve for B: hidden layer outputs first, then the original
            # features, then the bias column.  The previous order
            # (bias, features, hidden) multiplied B against misaligned
            # columns.
            H = [hl.transform(X) for hl in self.hidden_layers_]
            if self.include_original_features:
                H.append(_dense(X))
            H.append(np.ones((X.shape[0], 1)))

            return np.hstack(H) @ self.B.compute()
# Script fragment: build a (Latitude, Longitude, CM) Dask dataframe from
# three Dask arrays, aggregate CM per grid cell, then enumerate all integer
# (lat, lon) combinations.  ``lat``, ``lon``, ``cm``, ``client`` and
# ``countzero`` are defined outside this fragment.
lon=da.ravel(lon)
# Cast everything to integers before converting to dataframes.
lon=lon.astype(int)
lat=lat.astype(int)
cm=cm.astype(int)
Lat=lat.to_dask_dataframe()
Lon=lon.to_dask_dataframe()
CM=cm.to_dask_dataframe()
# Column-wise concat; the result's columns are 0, 1, 2 and are renamed below.
df=dd.concat([Lat,Lon,CM],axis=1,interleave_partitions=False)
cols = {0:'Latitude',1:'Longitude',2:'CM'}
df = df.rename(columns=cols)
df=client.persist(df)
# Apply ``countzero`` to the CM values of every (Longitude, Latitude) group.
# The lazy result is wrapped in ``delayed`` and evaluated on the next line.
df2=delayed(df.groupby(['Longitude','Latitude']).CM.apply(countzero).reset_index())
df3=df2.compute()
#print(gc.collect())
#print(df2)
#df3=client.compute(df2)
#print(df3)
#tt=client.gather(df3)
#print(tt)
client.close()
# Every integer pair with x in [-89, 90] and y in [-179, 180].
combs=[]
for x in range(-89,91):
    for y in range(-179,181):
        combs.append((x, y))
# Filter nodes and edges down to papers whose publication year is strictly
# greater than 2015 (the comparison below excludes 2015 itself).
year_column = 'Journal_JournalIssue_PubDate_Year'
papers = dd.read_csv('table_a01_articles.csv')
papers = papers[papers[year_column] > 2015][['PMID']]
papers['PMID'] = papers['PMID'].astype(str)
papers = papers.compute()

# pending: subset
edges = dd.read_csv('table_a14_reference_list.csv',
                    low_memory=False,
                    blocksize=128000000,
                    dtype={'RefArticleId': 'object'})
edges = edges.fillna(-1)
for id_column in ('PMID', 'RefArticleId'):
    edges[id_column] = edges[id_column].astype(str)
edges = client.persist(edges)

# Keep edges whose citing paper is in the filtered set ...
edges_ij_pmid = dd.merge(edges, papers, on=["PMID"]).compute()
# ... and whose cited paper is in it too. `papers` holds only the PMID column
# here, so joining against a renamed copy matches the original in-place
# rename / restore of `papers.columns`.
cited_papers = papers.rename(columns={'PMID': 'RefArticleId'})
edges_final = dd.merge(edges_ij_pmid, cited_papers, on=['RefArticleId'])

# Paper nodes: reload the articles table with all of its columns, apply the
# same year filter, and sort by PMID.
papers = dd.read_csv('table_a01_articles.csv')
papers = papers[papers[year_column] > 2015]
papers['PMID'] = papers['PMID'].astype(str)
papers = papers.compute().sort_values(axis=0, by='PMID', ascending=True)
#Reindexing dictionary
def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False):
    """Solve for the sky on ``sphere`` with a regularised linear fit.

    The complex operator ``gamma`` is split into stacked real and imaginary
    parts, the visibilities are split the same way, and a regularised
    regression is fitted to the augmented real system.

    Parameters
    ----------
    vis_arr : array-like
        Complex visibilities; real and imaginary parts are concatenated.
    sphere : object
        Provides ``pixels``, ``l``, ``m``, ``n``, ``npix`` and
        ``set_visible_pixels``.
    alpha : float
        Regularisation strength; divided by ``sqrt(n_pixels)`` before use.
    scale : bool, optional
        Forwarded to ``sphere.set_visible_pixels``.
    usedask : bool, optional
        If true, build the operator as dask arrays on a local in-process
        cluster and fit with ``dask_ml``; otherwise use scikit-learn's
        ``ElasticNet`` in memory.

    Returns
    -------
    The fitted coefficients reshaped to a column, ``(n, 1)``.
    """
    n_s = sphere.pixels.shape[0]

    lambduh = alpha/np.sqrt(n_s)
    if not usedask:
        gamma = self.make_gamma(sphere)
        logger.info("Building Augmented Operator...")
        proj_operator_real = np.real(gamma).astype(np.float32)
        proj_operator_imag = np.imag(gamma).astype(np.float32)
        # Drop references as soon as possible to keep peak memory down.
        gamma = None
        proj_operator = np.block([[proj_operator_real], [proj_operator_imag]])
        proj_operator_real = None
        proj_operator_imag = None
        logger.info('augmented: {}'.format(proj_operator.shape))

        vis_aux = np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32)
        logger.info('vis mean: {} shape: {}'.format(np.mean(vis_aux), vis_aux.shape))

        logger.info("Solving...")
        reg = linear_model.ElasticNet(alpha=lambduh, l1_ratio=0.05, max_iter=10000, positive=True)
        reg.fit(proj_operator, vis_aux)
        sky = reg.coef_

        score = reg.score(proj_operator, vis_aux)
        logger.info('Loss function: {}'.format(score))
    else:
        from dask_ml.linear_model import LinearRegression
        import dask_glm
        import dask.array as da
        from dask.distributed import Client, LocalCluster
        import dask

        logger.info('Starting Dask Client')
        # In-process local cluster. To use an external scheduler instead,
        # connect with Client('tcp://localhost:8786').
        cluster = LocalCluster(dashboard_address=':8231', processes=False)
        client = Client(cluster)
        logger.info("Client = {}".format(client))

        harmonic_list = []
        p2j = 2*np.pi*1.0j

        dl = sphere.l
        dm = sphere.m
        dn = sphere.n

        n_arr_minus_1 = dn - 1

        du = self.u_arr
        dv = self.v_arr
        dw = self.w_arr

        # One row of the operator per (u, v, w) sample.
        for u, v, w in zip(du, dv, dw):
            harmonic = da.from_array(np.exp(p2j*(u*dl + v*dm + w*n_arr_minus_1)) / np.sqrt(sphere.npix), chunks=(n_s,))
            # Keep the persisted handle; the original bound it to a
            # misspelled name and appended the un-persisted array.
            harmonic = client.persist(harmonic)
            harmonic_list.append(harmonic)

        gamma = da.stack(harmonic_list)
        logger.info('Gamma Shape: {}'.format(gamma.shape))
        gamma = gamma.conj()
        gamma = client.persist(gamma)

        logger.info('Gamma Shape: {}'.format(gamma.shape))

        logger.info("Building Augmented Operator...")
        proj_operator_real = da.real(gamma)
        proj_operator_imag = da.imag(gamma)
        proj_operator = da.block([[proj_operator_real], [proj_operator_imag]])
        proj_operator = client.persist(proj_operator)

        logger.info("Proj Operator shape {}".format(proj_operator.shape))
        vis_aux = da.from_array(np.array(np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32))

        # L2 penalty (the ElasticNet regularizer created before it in the
        # original was overwritten immediately and never used).
        en = dask_glm.regularizers.L2()

        dask.config.set({'array.chunk-size': '1024MiB'})
        # Chunk along rows only, so every block holds complete rows.
        A = da.rechunk(proj_operator, chunks=('auto', n_s))
        A = client.persist(A)
        y = client.persist(vis_aux)
        logger.info("Rechunking completed.. A= {}.".format(A.shape))
        reg = LinearRegression(penalty=en, C=1.0/lambduh,
                               fit_intercept=False,
                               solver='lbfgs',
                               max_iter=1000, tol=1e-8)
        reg.fit(A, y)
        sky = reg.coef_
        score = reg.score(proj_operator, vis_aux)
        logger.info('Loss function: {}'.format(score.compute()))

    logger.info("Solving Complete: sky = {}".format(sky.shape))

    # Honour the caller's `scale` flag (it used to be hard-coded to True).
    sphere.set_visible_pixels(sky, scale=scale)
    return sky.reshape(-1, 1)
# Baseline regressor: histogram tree method, leaf-wise ('lossguide') growth
# capped at 4 leaves per tree with no depth limit (max_depth=0).
base_model = dxgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', verbosity=3, n_jobs=-1, n_estimators=1000, learning_rate=0.010, max_depth=0, max_leaves=4, grow_policy='lossguide')
# Fit with joblib's 'dask' backend active.
with joblib.parallel_backend('dask'):
    base_model.fit(X_train, y_train.flatten())
#base_model.save_model('base_line_no_max_deph_lr_%f_%i.model'%(lr,leaves))
#
predictions = base_model.predict(X_test)
# persist() returns a new handle, so rebind the name.
predictions = client.persist(predictions)
#
#print ("########")
#print ("R^2:",r2_score(y_test.compute(), predictions.compute()))
#print ("MAE:",mean_absolute_error(y_test.compute(), predictions.compute()))
#print ("MSE:",mean_squared_error(y_test.compute(), predictions.compute()))
# Label the prediction columns with the test names of the selected round and
# write them out as CSV, named after the first test entry.
p = predictions.to_dask_dataframe(columns=rounds[choice]["test"])
p.to_csv("my_result_for_%s_SubCh2" % (rounds[choice]["test"][0]))
#parameters_for_testing = {
#    'colsample_bytree':[0.4,0.6,0.8],
#    'gamma':[0,0.03,0.1,0.3],
#    'min_child_weight':[1.5,6,10],
#    'learning_rate':[0.1,0.07],
#    'max_depth':[3,5],
# path datadir = getDirectory("/data") # get all filenames = getFiles("*listings.csv.gz", datadir) for fname in filenames: try: # read csv df = dd.read_csv(fname, dtype=dtypes, compression="gzip", engine="python", encoding='utf-8', assume_missing=True, sample=1024 * 1024, error_bad_lines=False) # select needed feilds data = df[selected] # load df in RAM data = client.persist(data) #- Write to csv, replace null by nan, no index data.repartition(npartitions=1).to_csv(str(fname) + '_*.csv', na_rep='nan', index=False) except Exception as e: print(e) print("Done")
class LightGBMDaskLocal:
    # https://github.com/Nixtla/mlforecast/blob/main/nbs/distributed.forecast.ipynb
    """Grid search with walk-forward validation for a Dask LightGBM regressor.

    The constructor starts a ``LocalCluster``, loads a partitioned Parquet
    dataset, casts and filters it, indexes it by ``sale_date`` and persists
    it on the cluster. ``gridsearch_wfv`` then loops over hyperparameter
    combinations and, for each one, over the train/validation splits
    produced by ``train_test_time_split``, fitting the model and collecting
    RMSE values. ``refit_and_save`` refits the best combination on the full
    dataset and uploads the booster to S3.

    Notes
    -----
    * ``persist`` does not modify its input in place, so the result must be
      assigned back: ``data = self.client.persist(data)``.
    * ``self.client.restart()`` restarts each worker process and clears
      whatever the workers hold in memory; it does not restart machines.
      It is used here as the clean-up step before exiting on any failure.
    """

    def __init__(
        self,
        curr_dt_time,
        n_workers,
        s3_path,
        startmonth,
        n_months_in_first_train_set,
        n_months_in_val_set,
        frac=None,
    ):
        """Start the local cluster and persist the prepared dataset on it.

        Parameters
        ----------
        curr_dt_time : used in the names of the performance report and the
            results CSV uploaded to S3.
        n_workers : number of workers for the ``LocalCluster``.
        s3_path : location of the Parquet dataset passed to
            ``dd.read_parquet``.
        startmonth : first month of data, used to place the splits.
        n_months_in_first_train_set : months in the first training window.
        n_months_in_val_set : months in every validation window.
        frac : fraction of rows to sample; ``None`` means all rows (1.0).

        On any failure the exception is logged, the workers are restarted
        and the process exits with status 1.
        """
        self.curr_dt_time = curr_dt_time
        self.startmonth = startmonth
        self.n_months_in_first_train_set = n_months_in_first_train_set
        self.n_months_in_val_set = n_months_in_val_set
        self.frac = frac if frac is not None else 1.0

        cluster = LocalCluster(n_workers=n_workers)
        self.client = Client(cluster)
        self.client.wait_for_workers(n_workers)
        print(f"***VIEW THE DASHBOARD HERE***: {cluster.dashboard_link}")

        # Row counts and index min/max each need a pass over the data, so
        # they are only computed when DEBUG records will actually be emitted.
        debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

        # Design notes kept from the original implementation:
        # * A map_partitions-based cast did not work without `meta=`, which
        #   would need every column listed, so columns are cast one by one.
        # * Dask collections are immutable (see
        #   https://github.com/dask/distributed/issues/1676), so the whole
        #   dataset is sent to the cluster once and sliced per fold rather
        #   than appended to.
        # * Dask reads each Parquet row group into its own partition:
        #   https://stackoverflow.com/questions/58437182/how-to-read-a-single-large-parquet-file-into-multiple-partitions-using-dask-dask
        # * https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html#dask.dataframe.read_parquet
        # * Open question: would looping over splits first and over
        #   hyperparameters second let each split be created only once?
        try:
            # create Dask dataframe from partitioned Parquet dataset on S3
            self.full_dataset = dd.read_parquet(
                s3_path, index=False, engine="pyarrow"
            ).sample(frac=self.frac, random_state=42)

            self.full_dataset["sale_date"] = self.full_dataset[
                "sale_date"].astype("datetime64[ns]")
            self.full_dataset["sid_shop_item_qty_sold_day"] = self.full_dataset[
                "sid_shop_item_qty_sold_day"].astype("int16")
            for col in self._categorical_columns(self.full_dataset):
                self.full_dataset[col] = self.full_dataset[col].astype("int16")

            if debug_enabled:
                logging.debug(
                    f"# of rows in full dataframe before removal of negative target values: {len(self.full_dataset)}"
                )

            self.full_dataset = self.full_dataset[
                self.full_dataset.sid_shop_item_qty_sold_day >= 0]

            # set_index, then drop empty partitions, then persist.
            # https://docs.dask.org/en/latest/generated/dask.dataframe.DataFrame.set_index.html
            # partition_size: desired size of each partition in bytes (used
            # together with npartitions='auto').
            self.full_dataset = self.full_dataset.set_index(
                "sale_date",
                sorted=False,
                npartitions="auto",
                partition_size=100_000_000,
            )
            self.full_dataset = self.cull_empty_partitions(self.full_dataset)

            self.full_dataset = self.client.persist(self.full_dataset)
            wait([self.full_dataset])

            if debug_enabled:
                logging.debug(
                    f"# of rows in full dataframe after removal of negative target values: {len(self.full_dataset)}"
                )
                logging.debug(
                    f"Earliest and latest dates in full dataframe are : {dd.compute(self.full_dataset.index.min(), self.full_dataset.index.max())}"
                )
            logging.debug(
                f"Data types of full Dask dataframe are: {self.full_dataset.dtypes}"
            )
        except Exception:
            self._abort(
                "Exception occurred while creating Dask dataframe and persisting it on the cluster."
            )

    @staticmethod
    def _feature_columns(df):
        """Return the column names that start with ``pc`` or ``cat``."""
        return [col for col in df if col.startswith(("pc", "cat"))]

    @staticmethod
    def _categorical_columns(df):
        """Return the column names that start with ``cat``."""
        return [col for col in df if col.startswith("cat")]

    def _abort(self, message):
        """Log the active exception, reset the workers and exit with status 1.

        Must be called from inside an ``except`` block so that
        ``logging.exception`` can attach the traceback.
        """
        logging.exception(message)
        # kill all active work, delete all data on the network, and restart
        # the worker processes.
        self.client.restart()
        sys.exit(1)

    def _make_model(self, params):
        """Build a ``DaskLGBMRegressor`` bound to this client with ``params``."""
        return lgb.DaskLGBMRegressor(
            client=self.client,
            random_state=42,
            silent=False,
            tree_learner="data",
            force_row_wise=True,
            **params,
        )

    def _fit_model(self, model, data):
        """Fit ``model`` on the feature, target and weight arrays of ``data``."""
        features = self._feature_columns(data)
        model.fit(
            data[features].to_dask_array(lengths=True),
            data["sid_shop_item_qty_sold_day"].to_dask_array(lengths=True),
            sample_weight=self.get_sample_weights(data),
            feature_name=features,
            categorical_feature=self._categorical_columns(data),
        )

    # function from https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
    def cull_empty_partitions(self, ddf):
        """Return ``ddf`` without its zero-length partitions.

        Partition lengths are computed eagerly. If no partition is empty the
        input is returned unchanged; otherwise the dataframe is rebuilt from
        the non-empty partitions, with an empty partition passed as ``meta``.
        """
        lengths = list(ddf.map_partitions(len).compute())
        partitions = ddf.to_delayed()
        kept = []
        pempty = None
        for ix, n_rows in enumerate(lengths):
            if n_rows == 0:
                pempty = ddf.get_partition(ix)
            else:
                kept.append(partitions[ix])
        if pempty is not None:
            ddf = dd.from_delayed(kept, meta=pempty)
        return ddf

    def gridsearch_wfv(self, params):
        """Evaluate every combination in ``params`` by walk-forward validation.

        ``params`` maps a hyperparameter name to the list of values to try.
        Each combination, together with its per-fold RMSE lists, fit times
        and averages, is appended to ``self.all_params_combs``. The
        combination with the lowest ``monthly_avg_rmse_`` is stored in
        ``self.best_params_`` / ``self.best_score_``, and the full table is
        written to ``all_params_combs.csv`` and uploaded to S3.

        Raises
        ------
        ValueError
            If ``params`` is empty or yields no combination (some value list
            is empty).
        """
        self.all_params_combs = list()
        # Statistics are only requested when there is a single combination,
        # i.e. when the longest list of candidate values has one entry.
        self.get_stats_ = max(len(values) for values in params.values()) == 1

        for values in product(*params.values()):
            params_comb_dict = dict(zip(params, values))
            self.params_comb_dict = params_comb_dict.copy()
            self.params_comb_dict["rmse_list_"] = list()
            self.params_comb_dict["monthly_rmse_list_"] = list()
            self.params_comb_dict["fit_times_list_"] = list()
            try:
                self.model = self._make_model(params_comb_dict)
            except Exception:
                self._abort(
                    "Exception occurred while initializing Dask model.")

            # loop over train-validation sets
            with performance_report(
                    filename=f"dask_report_{self.curr_dt_time}.html"):
                for train, test, get_stats in self.train_test_time_split():
                    self.fit(train).predict(test).rmse_all_folds(
                        test, get_stats)

            self.params_comb_dict["avg_rmse_"] = mean(
                self.params_comb_dict["rmse_list_"])
            self.params_comb_dict["monthly_avg_rmse_"] = mean(
                self.params_comb_dict["monthly_rmse_list_"])
            self.all_params_combs.append(self.params_comb_dict)

        if not self.all_params_combs:
            # Previously unreachable: min() below raised on the empty list
            # before this message could be logged.
            logging.debug(
                "List of parameter-result dictionaries is empty and was not converted to CSV!"
            )
            raise ValueError(
                "No hyperparameter combinations were evaluated.")

        best_params = min(self.all_params_combs,
                          key=lambda x: x["monthly_avg_rmse_"])
        self.best_score_ = best_params["monthly_avg_rmse_"]
        # keep only real hyperparameters (drop rmse_list_, avg_rmse_, etc.)
        self.best_params_ = {
            k: v
            for k, v in best_params.items() if k in params
        }

        # save list of parameter-result dictionaries to dataframe, then CSV
        all_params_combs_df = pd.DataFrame(self.all_params_combs)
        output_csv = "all_params_combs.csv"
        all_params_combs_df.to_csv(output_csv, index=False)

        try:
            key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
            s3_client = boto3.client("s3")
            s3_client.upload_file(output_csv, "sales-demand-data", key)
            logging.info(
                "Name of CSV uploaded to S3 and containing all parameter combinations "
                f"and results is: {key}")
        except ClientError:
            logging.exception(
                "CSV file with LightGBM parameter combinations and results was not copied to S3."
            )

    def train_test_time_split(self):
        """Yield ``(train, test, get_stats)`` for each walk-forward fold.

        ``train`` is everything up to the fold's end date and ``test`` the
        following ``n_months_in_val_set`` months. ``get_stats`` is true only
        for the last fold, and only when ``self.get_stats_`` is set.

        Worked example from the original notes: first month July 2015, one
        month in the first train set, two months in each validation set,
        three months between July and October 2015 gives
        ``(3 - 1 + 1) - (2 - 1)`` folds.
        """
        n_val_sets = (
            month_counter(self.startmonth)  # self.startmonth is e.g. July 1, 2015
            - self.n_months_in_first_train_set + 1
        ) - (self.n_months_in_val_set - 1)

        for m in range(n_val_sets):
            # day=31 snaps to the last day of the target month.
            end_date = self.startmonth + relativedelta(
                months=m + self.n_months_in_first_train_set - 1, day=31)
            get_stats = self.get_stats_ and m == n_val_sets - 1
            test_start = end_date + timedelta(days=1)
            test_end = end_date + relativedelta(
                months=self.n_months_in_val_set, day=31)
            yield (self.full_dataset.loc[:end_date],
                   self.full_dataset.loc[test_start:test_end],
                   get_stats)

    def get_sample_weights(self, train):
        """Return per-row weights: ``weight_for_zeros`` where the target is 0, else 1."""
        weights_arr = train["sid_shop_item_qty_sold_day"].to_dask_array(
            lengths=True).astype('float32')
        weights_arr = da.where(weights_arr == 0,
                               self.params_comb_dict['weight_for_zeros'], 1.)
        return weights_arr

    def fit(self, train):
        """Fit ``self.model`` on ``train``, record the fit time, return ``self``."""
        try:
            start_time = time.perf_counter()
            logging.debug(
                f"train X dtypes are {train[self._feature_columns(train)].dtypes}"
            )
            logging.debug(
                f"train y type is {train['sid_shop_item_qty_sold_day'].dtype}")
            self._fit_model(self.model, train)
            assert self.model.fitted_
            self.params_comb_dict["fit_times_list_"].append(
                time.perf_counter() - start_time)
            return self
        except Exception:
            self._abort(
                "Exception occurred while fitting model on train data during walk-forward validation."
            )

    def predict(self, test):
        """Store predictions for ``test`` in ``self.y_pred`` and return ``self``."""
        try:
            self.y_pred = self.model.predict(
                test[self._feature_columns(test)])
            return self
        except Exception:
            self._abort(
                "Exception occurred while computing predicted values on the test data."
            )

    def rmse_all_folds(self, test, get_stats):
        """Append this fold's overall and monthly RMSE to the current result dict."""
        try:
            self.params_comb_dict["rmse_list_"].append(
                calc_rmse(
                    test["sid_shop_item_qty_sold_day"].to_dask_array(
                        lengths=True),
                    self.y_pred.compute_chunk_sizes(),
                    get_stats,
                ))
            self.params_comb_dict["monthly_rmse_list_"].append(
                calc_monthly_rmse(
                    test[["shop_id", "item_id", "sid_shop_item_qty_sold_day"]],
                    self.y_pred,
                ))
        except Exception:
            self._abort(
                "Exception occurred while computing RMSE on the test data.")

    def refit_and_save(self, model_path):
        """Refit the best parameters on the full dataset and upload the booster.

        The booster is saved locally under the last ``/``-separated component
        of ``model_path`` and uploaded to S3 under the same name.

        https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
        """
        try:
            self.best_model = self._make_model(self.best_params_)
            self._fit_model(self.best_model, self.full_dataset)
            output_txt = str(model_path).split("/")[-1]
            self.best_model.booster_.save_model(output_txt)

            s3_client = boto3.client("s3")
            s3_client.upload_file(output_txt, "sales-demand-data", output_txt)
            logging.info(
                f"Name of saved model uploaded to S3 is: {output_txt}")
        except Exception:
            self._abort(
                "Exception occurred while fitting model on the full dataset and saving the booster to file on S3."
            )
# Trim the input to a bit larger than the target period for the rolling # average, making sure we have full days ds = ds.sel(time=slice('19791201', '20100131T2300')) print("Analysing %.2f GB" % (ds.mx2t.nbytes / (1024**3))) # Pre-process the input timeseries, then trim to the target date range rolled = rolling_maximum(ds.mx2t).sel(time=slice('19800101', '20100101')) # Run a percentile on each day of the year doy_p90 = (rolled.groupby('time.dayofyear').reduce(dask_percentile, dim='time', q=90, allow_lazy=True)) # Convert to a Dataset and save the output doy_p90 = doy_p90.to_dataset(name='mx2t_doy_p90') future = client.persist(doy_p90.to_netcdf('mx2t_doy_p90.nc', compute=False)) # Uncomment for a progress bar: # progress(future) future.compute() end = time.perf_counter() print() print("time", end - start) client.close()
from dask.datasets import timeseries
import time
from dask.dataframe.shuffle import shuffle
from dask.distributed import Client, wait

# Shuffle the demo `timeseries` dataset (one partition per minute over one
# day) by "id" on an already-running scheduler, then shut the cluster down.
if __name__ == "__main__":
    # Connect to an existing scheduler on the local machine.
    client = Client("127.0.0.1:8786")
    ddf_h = timeseries(start='2000-01-01', end='2000-01-02', partition_freq='1min')
    # Task-based shuffle on the "id" column.
    result = shuffle(ddf_h, "id", shuffle="tasks")
    ddf = client.persist(result)
    # Block until every persisted partition has been computed.
    _ = wait(ddf)
    client.shutdown()
    # Brief pause after requesting shutdown, before the script exits.
    time.sleep(0.5)
def main(args):
    """Run the DPrepB/DPrepC demonstration pipeline.

    Steps, in order:

    1. Create the output directories and, if ``args.queues`` is set, a Kafka
       producer for QA messages.
    2. Connect to the Dask scheduler at ``args.daskaddress``.
    3. Load both measurement sets channel by channel, derive the imaging
       advice from channel 0, scatter one data record per channel and
       submit ``dprepb_imaging`` for each of them.
    4. Resubmit any task that ended in the ``'error'`` state, gather the
       images and extract their data arrays as futures.
    5. Stack the arrays into one Dask array and write mean and standard
       deviation images for I, Q, U and P = sqrt(Q^2 + U^2).
    6. Tar and gzip the moment directory and print the elapsed time.

    Parameters
    ----------
    args : namespace
        Reads ``outputs``, ``inputs``, ``ms1``, ``ms2``, ``inst``,
        ``queues``, ``daskaddress``, ``channels``, ``plots``, ``uvcut``,
        ``pixels``, ``angres`` and ``twod``.
    """
    # ------------------------------------------------------
    # Print some stuff to show that the code is running:
    print("")
    os.system(
        "printf 'A demonstration of a \033[5mDPrepB/DPrepC\033[m SDP pipeline\n'"
    )
    print("")

    # Set the directory for the moment images:
    MOMENTS_DIR = args.outputs + '/MOMENTS'

    # Check that the output directories exist, if not then create:
    os.makedirs(args.outputs, exist_ok=True)
    os.makedirs(MOMENTS_DIR, exist_ok=True)

    # Set the polarisation definition of the instrument:
    POLDEF = init_inst(args.inst)

    # Setup Variables for SIP services
    # ------------------------------------------------------
    # Define the Queue Producer settings:
    if args.queues:
        queue_settings = {
            'bootstrap.servers': 'scheduler:9092',
            'message.max.bytes': 100000000
        }  #10.60.253.31:9092

    # Setup the Confluent Kafka Queue
    # ------------------------------------------------------
    if args.queues:
        from confluent_kafka import Producer
        import pickle
        # Create an SDP queue:
        sip_queue = Producer(queue_settings)

    # Define a Data Array Format
    # ------------------------------------------------------
    def gen_data(channel):
        # Closure: vis1, vis2, npixel_advice and cell_advice are bound
        # further down, before this function is first called.
        return np.array([
            vis1[channel], vis2[channel], channel, None, None, False, False,
            args.plots,
            float(args.uvcut),
            float(args.pixels), POLDEF, args.outputs,
            float(args.angres), None, None, None, None, None, None, args.twod,
            npixel_advice, cell_advice
        ])

    def publish_qa(image):
        # Queue a pickled QA summary of `image`; no-op without queues.
        if args.queues:
            sip_queue.produce('qa', pickle.dumps(qa_image(image), protocol=2))

    # Setup the Dask Cluster
    # ------------------------------------------------------
    starttime = t.time()

    dask.config.set(get=dask.distributed.Client.get)
    client = Client(
        args.daskaddress)  # scheduler for Docker container, localhost for P3.

    print("Dask Client details:")
    print(client)
    print("")

    # Define channel range for 1 subband, each containing 40 channels:
    channel_range = np.array(range(int(args.channels)))

    # Load the data into memory:
    # The input data should be interfaced with Buffer Management.
    print("Loading data:")
    print("")
    vis1 = [
        load('%s/%s' % (args.inputs, args.ms1), range(channel, channel + 1),
             POLDEF) for channel in range(0, int(args.channels))
    ]
    vis2 = [
        load('%s/%s' % (args.inputs, args.ms2), range(channel, channel + 1),
             POLDEF) for channel in range(0, int(args.channels))
    ]

    # Prepare Measurement Set
    # ------------------------------------------------------
    # Combine MSSS snapshots:
    vis_advice = append_visibility(vis1[0], vis2[0])

    # Apply a uv-distance cut to the data:
    vis_advice = uv_cut(vis_advice, float(args.uvcut))
    npixel_advice, cell_advice = uv_advice(vis_advice, float(args.uvcut),
                                           float(args.pixels))

    # Begin imaging via the Dask cluster
    # ------------------------------------------------------
    # Submit data for each channel to the client, and return an image:

    # Scatter all the data in advance to all the workers:
    # The data here could be passed via Data Queues. Queues may not be
    # ideal. Data throughput challenges. Need to think more about the
    # optimum approach.
    print("Scatter data to workers:")
    print("")
    big_job = [client.scatter(gen_data(channel)) for channel in channel_range]

    # Submit jobs to the cluster and create a list of futures:
    futures = [
        client.submit(dprepb_imaging, big_job[channel], pure=False, retries=3)
        for channel in channel_range
    ]
    # The dprepb_imaging function could generate QA, logging, and pass this
    # information via Data Queues. Queues work well for this. Python logging
    # calls are preferable. Send them to a text file on the node. Run
    # another service that watches that file. Or just read from standard
    # out. The Dockerisation will assist with logs.

    print("Imaging on workers:")
    # Watch progress:
    progress(futures)

    # Wait until all futures are complete:
    wait(futures)

    # Check that no futures have errors, if so resubmit:
    for index, future in enumerate(futures):
        if future.status == 'error':
            print("ERROR: Future", future, "has 'error' status, as:")
            # Report the remote exception without re-raising it here;
            # recreate_error_locally() would re-run the failing task in this
            # process and raise, so the resubmission below never happened.
            print(future.exception())
            print("Rerunning...")
            print("")
            future.cancel()
            futures[index] = client.submit(dprepb_imaging,
                                           big_job[index],
                                           pure=False,
                                           retries=3)

    # Wait until all futures are complete:
    wait(futures)

    # Gather results from the futures:
    results = client.gather(futures, errors='raise')

    # Run QA on ARL objects and produce to queue:
    if args.queues:
        print("Adding QA to queue:")
        for result in results:
            publish_qa(result)
        sip_queue.flush()

    # Return the data element of each ARL object, as a Dask future:
    futures = [
        client.submit(arl_data_future, result, pure=False, retries=3)
        for result in results
    ]

    progress(futures)

    wait(futures)

    # Calculate the Moment images
    # ------------------------------------------------------
    # Now use 'distributed Dask arrays' in order to parallelise the Moment
    # image calculation:
    # Construct a small Dask array for every future:
    print("")
    print("Calculating Moment images:")
    print("")
    arrays = [
        da.from_delayed(future,
                        dtype=np.dtype('float64'),
                        shape=(1, 4, 512, 512)) for future in futures
    ]

    # Stack all small Dask arrays into one:
    stack = da.stack(arrays, axis=0)

    # Combine chunks to reduce overhead - is initially (40, 1, 4, 512, 512):
    stack = stack.rechunk((1, 1, 4, 64, 64))

    # Spread the data around on the cluster:
    stack = client.persist(stack)
    # Data is now coordinated by the single logical Dask array, 'stack'.

    # Save the Moment images:
    # The output moment images should be interfaced with Buffer Management.
    # Need to know more about the Buffer specification. Related to initial
    # data distribution also/staging.
    print("Saving Moment images to disk:")
    print("")
    # First generate a template:
    image_template = import_image_from_fits('%s/imaging_dirty_WStack-%s.fits' %
                                            (args.outputs, 0))

    # Index 2 of `stack` selects the Stokes plane: 0 = I, 1 = Q, 2 = U.
    # P is the polarised intensity sqrt(Q^2 + U^2), formed per channel
    # before the moment is taken.
    stokes_cubes = (
        ('I', stack[:, :, 0, :, :]),
        ('Q', stack[:, :, 1, :, :]),
        ('U', stack[:, :, 2, :, :]),
        ('P', da.sqrt((da.square(stack[:, :, 1, :, :]) +
                       da.square(stack[:, :, 2, :, :])))),
    )
    # Moments are taken over axis 0 (the stacked per-channel images).
    moments = (
        ('Mean', lambda cube: cube.mean(axis=0)),
        ('Std', lambda cube: cube.std(axis=0)),
    )

    # Output all mean images first, then all standard deviation images,
    # each in I, Q, U, P order:
    for moment_name, reduce_cube in moments:
        for stokes, cube in stokes_cubes:
            image_template.data = reduce_cube(cube).compute()
            # Run QA on ARL objects and produce to queue:
            publish_qa(image_template)
            # Export the data to disk:
            export_image_to_fits(
                image_template,
                '%s/%s-%s.fits' % (MOMENTS_DIR, moment_name, stokes))

    # Flush queue:
    if args.queues:
        sip_queue.flush()

    # Make a tarball of moment images:
    subprocess.call([
        'tar', '-cvf',
        '%s/moment.tar' % (MOMENTS_DIR),
        '%s/' % (MOMENTS_DIR)
    ])
    subprocess.call(['gzip', '-9f', '%s/moment.tar' % (MOMENTS_DIR)])

    endtime = t.time()
    print(endtime - starttime)
# Remove the now redundant `day_of_year` and `time_of_day` columns: uoreg_df = uoreg_df.drop(['day_of_year', 'time_of_day'], axis=1) uoreg_df.head() # Sort by the timestamp: uoreg_df = uoreg_df.set_index('ts') uoreg_df.head() uoreg_df.tail() uoreg_df.visualize() # Save current dataframe in memory to avoid accumulating several operations on the dask graph uoreg_df = client.persist(uoreg_df) uoreg_df.visualize() # ## Exploring ambient temperature data # # Plotting the ambient temperature data to get an overview of it, try to visually identify possible outliers and check the points highlighted from each quality control flag value data = [ go.Scatter(x=uoreg_df.index.compute(), y=uoreg_df.ambient_temperature_1) ] layout = go.Layout(title='Ambient temperature 1') fig = go.FigureWidget(data, layout) fig uoreg_df.qlt_ctrl_flag_1.unique().compute()
def main():
    """Filter the NWATL21 subset and split it by vertical reference.

    Reads ``data/interim/NWATL21_subset`` (Parquet), drops rows with
    ``Med_depth <= -10000`` and rows from survey groups known to have
    inconsistent depths, then writes two HDF5 files holding ``Lon``,
    ``Lat`` and ``Med_depth``:

    * ``NWATL21_subset_msl.h5``: observations needing no tidal correction
      (referenced to MSL/MWL, or to LLWLT but deeper than -200).
    * ``NWATL21_subset_llwlt.h5``: LLWLT-referenced observations with
      ``Med_depth >= -200``, which need correcting.
    """
    client = Client(processes=False)

    def write_hdf(path, frame):
        # The context manager closes the store even if put() raises.
        with pd.HDFStore(path) as store:
            store.put('df', frame, data_columns=frame.columns)

    # read dask.DataFrame with extracted data
    df = dd.read_parquet('data/interim/NWATL21_subset')

    # remove data with Med_depth < -10000
    df = df.loc[df['Med_depth'] > -1e4]

    # remove data with certain Group_Id which have been identified to have
    # inconsistent depth values
    bad_groups = [
        'ect18-38', 'ch036l01', 'c2207', 'p885ns', 'a2091l01', 'kn151l4',
        'KJACK2006', 'BROWNSBANK1996'
    ]
    df = df.loc[~df['Group_Id'].isin(bad_groups)]

    # make sure all depths are negative
    # df['Med_depth'] = -1 * df['Med_depth'].abs()

    # compute up to here, keep results in memory
    df = client.persist(df)

    # Some observations are referenced to LLWLT and need to be corrected.
    # Therefore, the dataset is split up into two subsets for further
    # processing. It is further assumed that the tidal correction does not
    # affect the accuracy for observations with Med_depth < -200.

    # only select coordinates and Med_depth
    outcols = ['Lon', 'Lat', 'Med_depth']

    msl_refs = ['MSL:2005', 'MSL:2006', 'MWL:2006']
    llwlt_refs = ['LLWLT:2005', 'LLWLT:2006', 'VER_DAT:LLWLT']
    is_msl = df['Vertical_ref'].isin(msl_refs)
    is_llwlt = df['Vertical_ref'].isin(llwlt_refs)

    # observations relative to MSL or MWL - no correction needed.
    msldf = df.loc[is_msl | (is_llwlt & (df['Med_depth'] < -200)),
                   outcols].compute()

    # write output files
    write_hdf('data/interim/NWATL21_subset_msl.h5', msldf)
    # Free the first subset before materialising the second.
    del msldf

    # observations relative to LLWLT - correction needed.
    llwltdf = df.loc[is_llwlt & (df['Med_depth'] >= -200),
                     outcols].compute()

    # write output files
    write_hdf('data/interim/NWATL21_subset_llwlt.h5', llwltdf)
    del llwltdf