def from_pandas(cls, df, return_dims=False):
    """
    Return the partitions from Pandas DataFrame.

    The frame is cut into a 2D grid of blocks using the chunk sizes returned
    by ``compute_chunksize(df, NPartitions.get())``. A copy of every block is
    handed to ``cls._partition_class.put`` and the results are collected in
    row-major order.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to split.
    return_dims : bool, default: False
        When true, also return the row lengths and column widths of the
        blocks.

    Returns
    -------
    np.ndarray or (np.ndarray, list, list)
        ``np.array(parts)`` on its own, or the tuple
        ``(np.array(parts), row_lengths, col_widths)`` if `return_dims` is set.

    Raises
    ------
    ImportError
        If the progress bar is enabled but ``tqdm`` cannot be imported.
    """
    num_splits = NPartitions.get()
    put_func = cls._partition_class.put
    row_chunksize, col_chunksize = compute_chunksize(df, num_splits)

    n_rows = len(df)
    n_cols = len(df.columns)
    # Starting offset of every block along each axis. The lengths of these
    # ranges are the exact number of blocks that will be produced.
    row_starts = range(0, n_rows, row_chunksize)
    col_starts = range(0, n_cols, col_chunksize)

    # Read the setting once so the bar is created, updated and closed
    # consistently even if the configuration changes while we are running.
    show_progress = ProgressBar.get()
    pbar = None
    if show_progress:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                from tqdm.autonotebook import tqdm as tqdm_notebook
            except ImportError as err:
                raise ImportError(
                    "Please pip install tqdm to use the progress bar"
                ) from err
        bar_format = (
            "{l_bar}{bar}{r_bar}"
            if os.environ.get("DEBUG_PROGRESS_BAR", "False") == "True"
            else "{desc}: {percentage:3.0f}%{bar} Elapsed time: {elapsed}, estimated remaining time: {remaining}"
        )
        # One tick per block. Counting the range lengths (a ceiling division)
        # instead of rounding ``len / chunksize`` keeps the total in step with
        # the number of ``update`` calls below. Each axis is still clamped to
        # at least 1, as before.
        update_count = max(1, len(row_starts)) * max(1, len(col_starts))
        pbar = tqdm_notebook(
            total=update_count,
            desc="Distributing Dataframe",
            bar_format=bar_format,
        )

    try:
        parts = []
        for i in row_starts:
            row_parts = []
            for j in col_starts:
                block = df.iloc[i:i + row_chunksize, j:j + col_chunksize].copy()
                row_parts.append(put_func(block))
                if pbar is not None:
                    pbar.update(1)
            parts.append(row_parts)
    finally:
        # Close the bar even when a put fails part-way through.
        if pbar is not None:
            pbar.close()

    if not return_dims:
        return np.array(parts)

    # Every block is a full chunk except possibly the last one on an axis,
    # which holds whatever remains.
    row_lengths = [min(row_chunksize, n_rows - i) for i in row_starts]
    col_widths = [min(col_chunksize, n_cols - j) for j in col_starts]
    return np.array(parts), row_lengths, col_widths
def call_progress_bar(result_parts, line_no):
    """
    Show and advance a tqdm progress bar for a 2D collection of partitions.

    Bars are stored in the module-level ``progress_bars`` mapping under the
    key ``"<cell execution count>-<line_no>"``. If a bar already exists for
    that key its total is extended by the number of new futures; otherwise a
    new bar is created. The function then calls ``ray.wait`` once per future
    and ticks the bar after each call.

    Parameters
    ----------
    result_parts : iterable of iterables
        Rows of partition objects; the ``oid`` attribute of each is used as
        the future to wait on.
    line_no : int
        Line number used in the bar's key and description.

    Returns
    -------
    None
        Returns early, without showing a bar, when ``get_ipython()`` has no
        ``execution_count`` attribute.

    Raises
    ------
    ImportError
        If ``tqdm`` cannot be imported.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        try:
            from tqdm.autonotebook import tqdm as tqdm_notebook
        except ImportError as err:
            raise ImportError(
                "Please pip install tqdm to use the progress bar"
            ) from err
    from IPython import get_ipython

    try:
        cell_no = get_ipython().execution_count
    except AttributeError:
        # This happens if we are not in ipython or jupyter.
        # No progress bar is supported in that case.
        return

    pbar_id = str(cell_no) + "-" + str(line_no)
    futures = [x.oid for row in result_parts for x in row]
    n_futures = len(futures)
    bar_format = (
        "{l_bar}{bar}{r_bar}"
        if os.environ.get("DEBUG_PROGRESS_BAR", "False") == "True"
        else "{desc}: {percentage:3.0f}%{bar} Elapsed time: {elapsed}, estimated remaining time: {remaining}"
    )

    bar_lock.acquire()
    try:
        if pbar_id in progress_bars:
            pbar = progress_bars[pbar_id]
            if hasattr(pbar, "container"):
                # The child that carries ``max`` sits at index 0 or 1 of the
                # container; keep it in sync with the new total.
                index = 0 if hasattr(pbar.container.children[0], "max") else 1
                widget = pbar.container.children[index]
                widget.max = widget.max + n_futures
            pbar.total = pbar.total + n_futures
            pbar.refresh()
        else:
            pbar = tqdm_notebook(
                total=n_futures,
                desc="Estimated completion of line " + str(line_no),
                bar_format=bar_format,
            )
            progress_bars[pbar_id] = pbar
    finally:
        # Always release: an exception while creating or updating the bar
        # must not leave the lock held and block every later caller.
        bar_lock.release()

    threading.Thread(target=show_time_updates, args=(pbar,)).start()
    # Ask ray for one more completed future on each pass and tick the bar.
    for i in range(1, n_futures + 1):
        ray.wait(futures, num_returns=i)
        pbar.update(1)
        pbar.refresh()
    if pbar.n == pbar.total:
        pbar.close()