def parallel_calculate_chunks(cutoff_time, chunk_size, feature_set, approximate,
                              training_window, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var, target_time,
                              pass_columns, progress_bar, dask_kwargs=None,
                              progress_callback=None):
    from distributed import Future, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(n_jobs=n_jobs,
                                                    dask_kwargs=dask_kwargs,
                                                    entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote futures with a leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            msg = "Using EntitySet persisted on the cluster as dataset {}"
            progress_bar.write(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # pickle the feature set and scatter it to the workers
        pickled_feats = cloudpickle.dumps(feature_set)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        num_scattered_workers = len(client.who_has([Future(es_token)]).get(es_token, []))
        num_workers = len(client.scheduler_info()['workers'].values())

        # split the cutoff time dataframe into chunks, one task per chunk
        chunks = cutoff_time.groupby(cutoff_df_time_var)

        if not chunk_size:
            chunk_size = _handle_chunk_size(1.0 / num_workers, cutoff_time.shape[0])

        chunks = _chunk_dataframe_groups(chunks, chunk_size)
        chunks = [df for _, df in chunks]

        if len(chunks) < num_workers:
            chunk_warning = "Fewer chunks ({}) than workers ({}); consider reducing the chunk size"
            warning_string = chunk_warning.format(len(chunks), num_workers)
            progress_bar.write(warning_string)

        scatter_warning(num_scattered_workers, num_workers)
        end = time.time()
        scatter_time = round(end - start)

        # if enabled, reset the timer after scattering for better time-remaining estimates
        if not progress_bar.disable:
            progress_bar.reset()

        scatter_string = "EntitySet scattered to {} workers in {} seconds"
        progress_bar.write(scatter_string.format(num_scattered_workers, scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             feature_set=_saved_features,
                             chunk_size=None,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns,
                             progress_bar=None,
                             progress_callback=progress_callback)

        # gather results as they complete and update progress
        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                previous_progress = progress_bar.n
                progress_bar.update(result.shape[0])
                if progress_callback is not None:
                    update, progress_percent, time_elapsed = update_progress_callback_parameters(progress_bar, previous_progress)
                    progress_callback(update, progress_percent, time_elapsed)
    finally:
        if client is not None:
            client.close()
        # only close the cluster if it was not supplied by the caller
        if 'cluster' not in (dask_kwargs or {}) and cluster is not None:
            cluster.close()

    feature_matrix = pd.concat(feature_matrix)

    return feature_matrix
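
# ---------------------------------------------------------------------------
# The function above follows a scatter -> publish -> map -> gather-in-batches
# pattern on a Dask ``distributed`` cluster. Below is a minimal, self-contained
# sketch of that pattern for reference; the names used here (``scale``,
# ``base_future``, ``_scatter_map_gather_demo``) are hypothetical illustrations
# and not part of this module.
# ---------------------------------------------------------------------------
def _scatter_map_gather_demo():
    from distributed import Client, LocalCluster, as_completed

    def scale(x, base):
        return x * base

    cluster = LocalCluster(n_workers=2, threads_per_worker=1)
    client = Client(cluster)
    try:
        # scatter a shared object once; the resulting future is reused by every task
        [base_future] = client.scatter([10])
        futures = client.map(scale, range(8), base=base_future)

        results = []
        # consume completed tasks in batches, as the function above does
        for batch in as_completed(futures).batches():
            results.extend(client.gather(batch))
        return sorted(results)
    finally:
        client.close()
        cluster.close()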