def calc_word_freq(
    df: dd.DataFrame,
    top_words: int = 30,
    stopword: bool = True,
    lemmatize: bool = False,
    stem: bool = False,
) -> Dict[str, Any]:
    """
    Parse a categorical column of text data into words, and then
    compute the frequency distribution of words and the total number of words.

    Parameters
    ----------
    df
        Groupby-count on the categorical column as a dataframe
    top_words
        Number of highest frequency words to show in the wordcloud
        and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing the word frequencies, else don't
    stem
        If True, extract the stem of the words before computing the word frequencies, else don't
    """
    col = df.columns[0]
    if stopword:
        # use a regex to replace stop words and non-alphanumeric characters with empty string
        df[col] = df[col].str.replace(fr"\b(?:{'|'.join(ess)})\b|[^\w+ ]", "")
    else:
        df[col] = df[col].str.replace(r"[^\w+ ]", "")

    # convert to lowercase and split
    df[col] = df[col].str.lower().str.split()
    # "explode()" to "stack" all the words in a list into a new column
    df = df.explode(col)

    # lemmatize and stem
    if lemmatize or stem:
        df[col] = df[col].dropna()
    if lemmatize:
        lem = WordNetLemmatizer()
        df[col] = df[col].apply(lem.lemmatize, meta="object")
    if stem:
        porter = PorterStemmer()
        df[col] = df[col].apply(porter.stem, meta="object")

    # counts of words, excludes null values
    word_cnts = df.groupby(col)[df.columns[1]].sum()
    # total number of words
    nwords = word_cnts.sum()
    # total uniq words
    nuniq_words = word_cnts.shape[0]
    # words with the highest frequency
    fnl_word_cnts = word_cnts.nlargest(n=top_words)

    return {"word_cnts": fnl_word_cnts, "nwords": nwords, "nuniq_words": nuniq_words}
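# Usage sketch for calc_word_freq (illustrative only): the function expects the result of a
# groupby-count on a text column, i.e. a two-column dask dataframe of (text, count). The column
# and variable names below are hypothetical, and stopword=False avoids the module-level
# stop-word list `ess` that the function otherwise references.
import dask
import dask.dataframe as dd
import pandas as pd

reviews = pd.DataFrame({"review": ["good food", "bad service", "good food"]})
grouped = dd.from_pandas(reviews, npartitions=1).groupby("review").size().reset_index()
freqs = calc_word_freq(grouped, top_words=10, stopword=False)
word_cnts, nwords = dask.compute(freqs["word_cnts"], freqs["nwords"])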
def reset_index_dask(ddf: dd.DataFrame) -> dd.DataFrame:
    return ddf.assign(idx=1)\
        .assign(idx=lambda df: df.idx.cumsum() - 1)\
        .set_index('idx', sorted=True)\
        .map_partitions(lambda df: df.rename(index={'idx': None}))
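# Behaviour sketch (hypothetical data): reset_index_dask replaces whatever index the frame has
# with a globally increasing 0-based integer index, computed with a cumulative sum so it stays
# monotonic across partitions.
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"a": [10, 20, 30, 40]}, index=[7, 3, 9, 1])
print(reset_index_dask(dd.from_pandas(pdf, npartitions=2, sort=False)).compute())
# index becomes 0, 1, 2, 3 while the original row order is preserved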
def missing_impact_1v1(  # pylint: disable=too-many-locals
    df: dd.DataFrame,
    x: str,
    y: str,
    bins: int,
    ndist_sample: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:  # pylint: disable=too-many-arguments
    """
    Calculate the distribution change on another column y
    when the missing values in x are dropped.
    """
    df0 = df[[x, y]]
    df1 = df.dropna(subset=[x])
    srs0, srs1 = df0[y], df1[y]
    minimum, maximum = srs0.min(), srs0.max()

    hists = [
        histogram(srs, dtype=dtype, bins=bins, return_edges=True) for srs in [srs0, srs1]
    ]
    hists = da.compute(*hists)

    meta = ColumnsMetadata()
    meta["y", "dtype"] = detect_dtype(df[y], dtype)

    if is_dtype(detect_dtype(df[y], dtype), Continuous()):
        dists = [rv_histogram((hist[0], hist[2])) for hist in hists]  # type: ignore
        xs = np.linspace(minimum, maximum, ndist_sample)

        pdfs = [dist.pdf(xs) for dist in dists]
        cdfs = [dist.cdf(xs) for dist in dists]

        distdf = pd.DataFrame(
            {
                "x": np.tile(xs, 2),
                "pdf": np.concatenate(pdfs),
                "cdf": np.concatenate(cdfs),
                "label": np.repeat(LABELS, ndist_sample),
            }
        )

        counts, xs, edges = zip(*hists)

        lower_bounds: List[float] = []
        upper_bounds: List[float] = []
        for edge in edges:
            lower_bounds.extend(edge[:-1])
            upper_bounds.extend(edge[1:])

        histdf = pd.DataFrame(
            {
                "x": np.concatenate(xs),
                "count": np.concatenate(counts),
                "label": np.repeat(LABELS, [len(count) for count in counts]),
                "lower_bound": lower_bounds,
                "upper_bound": upper_bounds,
            }
        )

        quantiles = [
            [srs.quantile(q) for q in [0, 0.25, 0.5, 0.75, 1]] for srs in [srs0, srs1]
        ]
        quantiles = dd.compute(*quantiles)

        boxdf = pd.DataFrame(quantiles)
        boxdf.columns = ["min", "q1", "q2", "q3", "max"]

        iqr = boxdf["q3"] - boxdf["q1"]
        boxdf["upper"] = np.minimum(boxdf["q3"] + 1.5 * iqr, boxdf["max"])
        # the lower whisker is anchored at q1 (not q3) per the usual box plot convention
        boxdf["lower"] = np.maximum(boxdf["q1"] - 1.5 * iqr, boxdf["min"])
        boxdf["label"] = LABELS

        itmdt = Intermediate(
            dist=distdf,
            hist=histdf,
            box=boxdf,
            meta=meta["y"],
            x=x,
            y=y,
            visual_type="missing_impact_1v1",
        )
        return itmdt
    else:
        counts, xs = zip(*hists)

        df = pd.DataFrame(
            {
                "x": np.concatenate(xs, axis=0),
                "count": np.concatenate(counts, axis=0),
                "label": np.repeat(LABELS, [len(count) for count in counts]),
            }
        )

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins:
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            partial = (bins, len(counts[0]))
        else:
            partial = (len(counts[0]), len(counts[0]))

        meta["y", "partial"] = partial

        itmdt = Intermediate(
            hist=df,
            x=x,
            y=y,
            meta=meta["y"],
            visual_type="missing_impact_1v1",
        )
        return itmdt
def m_o(engine: DaskExecutionEngine, df: dd.DataFrame) -> None:
    assert 1 == df.compute().shape[0]
def feature_engineering(data: dd.DataFrame) -> dd.DataFrame:
    # data = data.repartition(npartitions=1)
    # persist the (lazy) frame so downstream steps reuse the computed partitions
    data = data.persist()
    return data
def _to_parquet(ddf: dd.DataFrame, savepath: Path):
    return ddf.to_parquet(savepath)
def lemmatize_tweets(
    self, tweet_dataframe: dask_dataframe
) -> Union[dask_dataframe, pd.DataFrame]:
    tweet_dataframe['text'] = tweet_dataframe.apply(
        lambda x: self._lemmatize(x['text']), axis=1, meta=str)
    return tweet_dataframe
def build_timedelta_features(ddf: dd.DataFrame, config: RunConfig) -> dd.DataFrame:
    """Builds features for time differences between records or from present."""
    return (ddf.pipe(calculate_timedeltas, config)
               .pipe(build_timedelta_disqualifiers, config)
               .pipe(convert_timedeltas_to_days))
def remove_unneeded_columns(ddf: dd.DataFrame) -> dd.DataFrame:
    unneeded_columns = [
        'date_if_conviction', 'date_if_felony_conviction', 'is_class_1_or_2',
        'is_class_3_or_4'
    ]
    return ddf.drop(unneeded_columns, axis=1)
def shuffle_store_dask_partitions(
    ddf: dd.DataFrame,
    table: str,
    secondary_indices: Optional[InferredIndices],
    metadata_version: int,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    dataset_uuid: str,
    num_buckets: int,
    sort_partitions_by: List[str],
    bucket_by: Sequence[str],
) -> da.Array:
    """
    Perform a dataset update with dask reshuffling to control partitioning.

    The shuffle operation will perform the following steps

    1. Pack payload data

       Payload data is serialized and compressed into a single byte value using
       ``distributed.protocol.serialize_bytes``, see also ``pack_payload``.

    2. Apply bucketing

       Hash the column subset ``bucket_by`` and distribute the hashes in
       ``num_buckets`` bins/buckets. Internally every bucket is identified by an
       integer and we will create one physical file for every bucket ID. The
       bucket ID is not exposed to the user and is dropped after the shuffle,
       before the store. This is done since we do not want to guarantee at the
       moment, that the hash function remains stable.

    3. Perform shuffle (dask.DataFrame.groupby.apply)

       The groupby key will be the combination of ``partition_on`` fields and the
       hash bucket ID. This will create a physical file for every unique tuple in
       ``partition_on + bucket_ID``. The function which is applied to the
       dataframe will perform all necessary subtask for storage of the dataset
       (partition_on, index calc, etc.).

    4. Unpack data (within the apply-function)

       After the shuffle, the first step is to unpack the payload data since the
       follow up tasks will require the full dataframe.

    5. Pre storage processing and parquet serialization

       We apply important pre storage processing like sorting data, applying final
       partitioning (at this time there should be only one group in the payload data
       but using the ``MetaPartition.partition_on`` guarantees the appropriate data
       structures kartothek expects are created.). After the preprocessing is done,
       the data is serialized and stored as parquet. The applied function will return
       an (empty) MetaPartition with indices and metadata which will then be used to
       commit the dataset.

    Returns
    -------
    A dask.Array holding relevant MetaPartition objects as values
    """
    if ddf.npartitions == 0:
        return ddf

    group_cols = partition_on.copy()

    if num_buckets is None:
        raise ValueError("``num_buckets`` must not be None when shuffling data.")

    meta = ddf._meta
    meta[_KTK_HASH_BUCKET] = np.uint64(0)
    ddf = ddf.map_partitions(_hash_bucket, bucket_by, num_buckets, meta=meta)
    group_cols.append(_KTK_HASH_BUCKET)

    unpacked_meta = ddf._meta

    ddf = pack_payload(ddf, group_key=group_cols)
    ddf_grouped = ddf.groupby(by=group_cols)

    unpack = partial(
        _unpack_store_partition,
        secondary_indices=secondary_indices,
        sort_partitions_by=sort_partitions_by,
        table=table,
        dataset_uuid=dataset_uuid,
        partition_on=partition_on,
        store_factory=store_factory,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        unpacked_meta=unpacked_meta,
    )
    return cast(
        da.Array,  # Output type depends on meta but mypy cannot infer this easily.
        ddf_grouped.apply(unpack, meta=("MetaPartition", "object")),
    )
def pack_payload(df: dd.DataFrame, group_key: Union[List[str], str]) -> dd.DataFrame:
    """
    Pack all payload columns (everything except the group_key) into a single
    column. This column will contain a single byte string containing the
    serialized and compressed payload data. The payload data is just dead weight
    when reshuffling. By compressing it once before the shuffle starts, this
    saves a lot of memory and network/disk IO.

    Example::

        >>> import pandas as pd
        ... import dask.dataframe as dd
        ... from dask.dataframe.shuffle import pack_payload
        ...
        ... df = pd.DataFrame({"A": [1, 1] * 2 + [2, 2] * 2 + [3, 3] * 2, "B": range(12)})
        ... ddf = dd.from_pandas(df, npartitions=2)

        >>> ddf.partitions[0].compute()

           A  B
        0  1  0
        1  1  1
        2  1  2
        3  1  3
        4  2  4
        5  2  5

        >>> pack_payload(ddf, "A").partitions[0].compute()

           A                               __dask_payload_bytes
        0  1  b'\x03\x00\x00\x00\x00\x00\x00\x00)\x00\x00\x03...
        1  2  b'\x03\x00\x00\x00\x00\x00\x00\x00)\x00\x00\x03...

    See also https://github.com/dask/dask/pull/6259
    """
    if (
        # https://github.com/pandas-dev/pandas/issues/34455
        isinstance(df._meta.index, pd.Float64Index)
        # TODO: Try to find out what's going on and file a bug report
        # For datetime indices the apply seems to be corrupt
        # s.t. apply(lambda x: x) returns different values
        or isinstance(df._meta.index, pd.DatetimeIndex)
    ):
        return df

    if not HAS_DISTRIBUTED:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return df

    if not isinstance(group_key, list):
        group_key = [group_key]

    packed_meta = df._meta[group_key]
    packed_meta[_PAYLOAD_COL] = b""

    _pack_payload = partial(pack_payload_pandas, group_key=group_key)

    return df.map_partitions(_pack_payload, meta=packed_meta)
def flatten_aggregated_columns(dd: DataFrame):
    """
    API to make aggregated columns that are MultiIndex style flat

    Args:
        dd: target dask dataframe

    Returns:
        result dask dataframe

    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> pd = pandas.DataFrame({'a': [1, 10, 100, 1, 1, 100], 'b': range(0, 600, 100),
        ...                        'key': [0, 1, 2, 0, 1, 2]})
        >>> print(pd)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b  key
        0    1    0    0
        1   10  100    1
        2  100  200    2
        3    1  300    0
        4    1  400    1
        5  100  500    2
        >>> dd = dask.dataframe.from_pandas(pd, npartitions=2)
        >>> groupby = dd.groupby(['key'])
        >>> groupby_result = groupby.agg({'a': ['sum', 'min'], 'b': ['mean', 'sum', 'max']})
        >>> print(groupby_result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
               a           b
             sum  min   mean  sum  max
        key
        0      2    1  150.0  300  300
        1     11    1  250.0  500  400
        2    200  100  350.0  700  500
        >>> flatten = flatten_aggregated_columns(groupby_result)
        >>> print(flatten.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
             a_sum  a_min  b_mean  b_sum  b_max
        key
        0        2      1   150.0    300    300
        1       11      1   250.0    500    400
        2      200    100   350.0    700    500
        >>> print(flatten_aggregated_columns(dd).compute())
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b  key
        0    1    0    0
        1   10  100    1
        2  100  200    2
        3    1  300    0
        4    1  400    1
        5  100  500    2
    """
    if not isinstance(dd.columns, MultiIndex) or dd.columns.nlevels != 2:
        return dd
    result = dd.copy()
    columns = []
    for l1, l2 in zip(dd.columns.get_level_values(0), dd.columns.get_level_values(1)):
        if l2 == '':
            columns.append(l1)
        else:
            columns.append('_'.join((l1, l2)))
    result.columns = columns
    return result
def normalize_column_names(df: dd.DataFrame, enabled) -> dd.DataFrame:
    if enabled:
        df.columns = normalize_names(df.columns)
    return df
def _apply_function_over(
    self,
    df: dd.DataFrame,
    f: Callable,
    operands: List[dd.Series],
    window: org.apache.calcite.rex.RexWindow,
    group_columns: List[str],
    sort_columns: List[str],
    sort_ascending: List[bool],
    sort_null_first: List[bool],
) -> Tuple[dd.DataFrame, str]:
    """Apply the given function over the dataframe, possibly grouped and sorted per group"""
    temporary_operand_columns = {
        new_temporary_column(df): operand for operand in operands
    }
    df = df.assign(**temporary_operand_columns)
    # Important: move as few bytes as possible to the pickled function,
    # which is evaluated on the workers
    temporary_operand_columns = temporary_operand_columns.keys()

    # Extract the window definition
    lower_bound = to_bound_description(window.getLowerBound())
    upper_bound = to_bound_description(window.getUpperBound())

    new_column_name = new_temporary_column(df)

    @make_pickable_without_dask_sql
    def map_on_each_group(partitioned_group):
        # Apply sorting
        if sort_columns:
            partitioned_group = sort_partition_func(
                partitioned_group, sort_columns, sort_ascending, sort_null_first
            )

        if f is None:
            # This is the row_number operator.
            # We do not need to do any windowing
            column_result = range(1, len(partitioned_group) + 1)
        else:
            # In all other cases, apply the windowing operation
            if lower_bound.is_unbounded and (
                upper_bound.is_current_row or upper_bound.offset == 0
            ):
                windowed_group = partitioned_group.expanding(min_periods=0)
            elif lower_bound.is_preceding and (
                upper_bound.is_current_row or upper_bound.offset == 0
            ):
                windowed_group = partitioned_group.rolling(
                    window=lower_bound.offset + 1,
                    min_periods=0,
                )
            else:
                lower_offset = (
                    lower_bound.offset if not lower_bound.is_current_row else 0
                )
                if lower_bound.is_preceding and lower_offset is not None:
                    lower_offset *= -1
                upper_offset = (
                    upper_bound.offset if not upper_bound.is_current_row else 0
                )
                if upper_bound.is_preceding and upper_offset is not None:
                    upper_offset *= -1

                indexer = Indexer(lower_offset, upper_offset)
                windowed_group = partitioned_group.rolling(
                    window=indexer, min_periods=0
                )

            column_result = f(windowed_group, *temporary_operand_columns)

        partitioned_group = partitioned_group.assign(
            **{new_column_name: column_result}
        )
        return partitioned_group

    # Currently, pandas will always return a float for windowing operations
    meta = df._meta_nonempty.assign(**{new_column_name: 0.0})

    df = df.groupby(group_columns).apply(map_on_each_group, meta=meta)

    return df, new_column_name
def compute_final_dataframe(df: dd.DataFrame) -> pd.DataFrame:
    """Execute dask task graph and compute final results"""
    return (df.compute()
              .reset_index()
              .pivot(index='drive_time', columns='trip_distance', values='avg_amount')
              .fillna(0))
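# Illustrative call for compute_final_dataframe: it assumes an aggregated frame whose
# reset_index() yields "drive_time", "trip_distance" and "avg_amount" columns (e.g. a
# groupby-mean over taxi trips). The data below is made up.
import dask.dataframe as dd
import pandas as pd

agg = pd.DataFrame(
    {"trip_distance": [1, 2, 1], "avg_amount": [7.0, 9.5, 12.0]},
    index=pd.Index([5, 5, 10], name="drive_time"),
)
wide = compute_final_dataframe(dd.from_pandas(agg, npartitions=1))
# rows are drive_time, columns are trip_distance, missing combinations are filled with 0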
def _15(obj: dd.DataFrame) -> KmerAlignFormat:
    ff = KmerAlignFormat()
    obj.to_csv(str(ff), sep='\t', index=False, single_file=True)
    return ff
def _slice_timeid_column(ddf: dd.DataFrame) -> dd.DataFrame:
    ddf["day"] = ddf["timeid"].str.slice(0, 3).astype("int16")
    ddf["halfhourly_id"] = ddf["timeid"].str.slice(3, 5).astype("int8")
    return ddf.drop(columns=["timeid"])
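# Example of the timeid layout this helper assumes: the first three characters encode the day
# number and the next two the half-hourly period (the sample values below are made up).
import dask.dataframe as dd
import pandas as pd

raw = dd.from_pandas(
    pd.DataFrame({"timeid": ["00101", "00148"], "kwh": [0.3, 0.5]}), npartitions=1
)
print(_slice_timeid_column(raw).compute())
# -> kwh column plus day=[1, 1] (int16) and halfhourly_id=[1, 48] (int8); timeid is dropped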
def _process_table_identifiers(
        pdf: DataFrame,
        dimension_combinations: Optional[List[List[str]]] = None,
        max_combination_length: int = 5) -> List[List[str]]:
    """
    Dask wrapper around extracting identifiers from a single sampled table (pdf).

    This method submits multiple sub-tasks to identify possible identifier combinations,
    waits for them to complete and returns one or more dimension combinations.

    Note that the `worker_client` call forces the task to secede from the Worker's
    thread-pool, therefore it does not block any other computations and cannot cause
    a deadlock while waiting for sub-tasks to finish.
    """
    with timed_block('[idparser] Computing number of rows took {:.3f} seconds', logger,
                     logging.DEBUG):
        num_rows = len(pdf)

    with timed_block('[idparser] Pruning columns took {:.3f} seconds', logger,
                     logging.DEBUG):
        # filter out columns that contain at least X% null values -
        # null values can't be parts of the primary key
        columns = [
            col for col, count in pdf.count().compute().items()
            if count / num_rows >= NON_NULL_VALUES_RATIO
        ]

    with worker_client(separate_thread=True) as client:  # type: Client
        with timed_block('[idparser] Generating combinations took {:.3f} seconds', logger,
                         logging.DEBUG):
            # explore all possible dimension combinations if none are provided
            if dimension_combinations is None:
                all_possible_combinations = itertools.chain.from_iterable(
                    itertools.combinations(columns, i)
                    for i in range(1, min(max_combination_length, len(columns)) + 1))
                generated_combinations: List[List[str]] = [
                    sorted(combination) for combination in all_possible_combinations
                ]
            else:
                generated_combinations = dimension_combinations

        with timed_block('[idparser] Waiting for all combination tasks took {:.3f} seconds',
                         logger, logging.DEBUG):
            with timed_block('[idparser] Submitting all combination tasks took {:.3f} seconds',
                             logger, logging.DEBUG):
                # submit "per dimension combination" tasks
                futures = client.map(
                    lambda combination: _process_possible_identifier_combination(pdf, combination),
                    generated_combinations,
                    key=[
                        f'comb_{combination}_{str(uuid4())}'
                        for combination in generated_combinations
                    ],
                    # priority=100,
                    # batch_size=32,
                    retries=2,
                )

            results = client.gather(futures)

    return [
        dimensions for dimensions, num_duplicates in results if num_duplicates == 0
    ]
def filter_stopwords(
    self, tweet_dataframe: dask_dataframe
) -> Union[dask_dataframe, pd.DataFrame]:
    tweet_dataframe['text'] = tweet_dataframe.apply(
        lambda x: self._remove_stopwords(x['text']), axis=1, meta=str)
    return tweet_dataframe
def map_partitions_as_meta(dd: DataFrame, func: Callable[..., pandas.DataFrame],
                           meta: pandas.DataFrame, **kwargs):
    """
    API to do map_partitions, and reformat the result using meta.
    It may avoid errors caused when the map_partitions result and meta don't match.

    Args:
        dd (DataFrame): dask dataframe to do map_partitions.
        func (Callable[[[pandas.DataFrame, ...]], pandas.DataFrame]): function for map_partitions.
        meta (pandas.DataFrame): expected schema of map_partitions result.
        kwargs: additional arguments for func.

    Returns:
        result dask dataframe

    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> pd = pandas.DataFrame({'a1': ['1,2,3', '2,3,4', '3,4'], 'a2': ['a,b,c', 'b,c,d', 'c,d'],
        ...                        'b': [1, 2, 3], 'idx': [0, 1, 2]}).set_index('idx')
        >>> print(pd)
        ... # doctest: +NORMALIZE_WHITESPACE
                a1     a2  b
        idx
        0    1,2,3  a,b,c  1
        1    2,3,4  b,c,d  2
        2      3,4    c,d  3
        >>> transformer = lambda pd: pd[['b', 'a1', 'a2']]
        >>> dd = dask.dataframe.from_pandas(pd, npartitions=2)
        >>> result1 = dd.map_partitions(transformer)
        >>> print(result1.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
             b     a1     a2
        idx
        0    1  1,2,3  a,b,c
        1    2  2,3,4  b,c,d
        2    3    3,4    c,d
        >>> meta = make_meta(('idx', 'int'), [('a1', 'object'), ('a2', 'object'), ('b', 'int')])
        >>> result2 = map_partitions_as_meta(dd, transformer, meta)
        >>> print(result2.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
                a1     a2  b
        idx
        0    1,2,3  a,b,c  1
        1    2,3,4  b,c,d  2
        2      3,4    c,d  3
        >>> transformer2 = lambda pd, v: pd[['b']] * v
        >>> result3 = map_partitions_as_meta(dd, transformer2,
        ...                                  make_meta(('idx', 'int'), [('b', 'int')]),
        ...                                  v=100)
        >>> print(result3.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
               b
        idx
        0    100
        1    200
        2    300
    """
    def apply_meta(pd: pandas.DataFrame):
        result = func(pd, **kwargs)
        meta_column_names = [c for c in meta.columns]
        return result[meta_column_names]

    return dd.map_partitions(apply_meta)
def calc_box(
    df: dd.DataFrame, bins: int, ngroups: int = 10, largest: bool = True
) -> Tuple[pd.DataFrame, List[str], List[float], Optional[Dict[str, int]]]:
    """
    Compute a box plot over either
    1) the values in one column
    2) the values corresponding to groups in another column
    3) the values corresponding to binning another column

    Parameters
    ----------
    df : dd.DataFrame
        dask dataframe with one or two columns
    bins : int
        number of bins to use if df has two numerical columns
    ngroups : int
        number of groups to show if df has a categorical and numerical column
    largest : bool
        when calculating a box plot per group, select the largest or smallest groups

    Returns
    -------
    Tuple[pd.DataFrame, List[str], List[float], Dict[str, int]]
        The box plot statistics in a dataframe, a list of the outlier groups
        and another list of the outlier values, a dictionary logging the sampled group output
    """
    # pylint: disable=too-many-locals
    grp_cnt_stats = None  # to inform the user of sampled output

    x = df.columns[0]
    if len(df.columns) == 1:
        df = _calc_box_stats(df[x], x)
    else:
        y = df.columns[1]
        if is_numerical(df[x].dtype) and is_numerical(df[y].dtype):
            minv, maxv, cnt = dask.compute(df[x].min(), df[x].max(), df[x].nunique())
            if cnt < bins:
                bins = cnt - 1
            endpts = np.linspace(minv, maxv, num=bins + 1)
            # calculate a box plot over each bin
            df = dd.concat(
                [
                    _calc_box_stats(
                        df[(df[x] >= endpts[i]) & (df[x] < endpts[i + 1])][y],
                        f"[{endpts[i]},{endpts[i+1]})",
                    )
                    if i != len(endpts) - 2
                    else _calc_box_stats(
                        df[(df[x] >= endpts[i]) & (df[x] <= endpts[i + 1])][y],
                        f"[{endpts[i]},{endpts[i+1]}]",
                    )
                    for i in range(len(endpts) - 1)
                ],
                axis=1,
            ).compute()
        else:
            df, grp_cnt_stats, largest_grps = _calc_groups(df, ngroups, largest)
            # calculate a box plot over each group
            df = dd.concat(
                [_calc_box_stats(df[df[x] == grp][y], grp) for grp in largest_grps],
                axis=1,
            ).compute()

    df = df.append(
        pd.Series(
            {c: i + 1 for i, c in enumerate(df.columns)},
            name="x",
        )
    ).T
    df.index.name = "grp"
    df = df.reset_index()
    df["x0"], df["x1"] = df["x"] - 0.8, df["x"] - 0.2  # width of whiskers for plotting

    outx: List[str] = []  # list for the outlier groups
    outy: List[float] = []  # list for the outlier values
    for ind in df.index:
        otlrs = df.loc[ind]["otlrs"]
        outx = outx + [df.loc[ind]["grp"]] * len(otlrs)
        outy = outy + otlrs

    return df, outx, outy, grp_cnt_stats
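# Usage sketch for calc_box (depends on the private helpers _calc_box_stats and _calc_groups
# from the same module; data and names are illustrative). With a categorical first column and a
# numerical second column it returns one box per group plus any outlier points.
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"species": ["a", "a", "a", "b", "b", "b"],
                    "length": [1.0, 2.0, 3.0, 2.0, 4.0, 9.0]})
boxdf, outx, outy, grp_cnt_stats = calc_box(dd.from_pandas(pdf, npartitions=1), bins=10)
print(boxdf[["grp", "q1", "q2", "q3"]], outx, outy)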
def missing_impact_1vn(  # pylint: disable=too-many-locals
        df: dd.DataFrame, x: str, bins: int) -> Intermediate:
    """
    Calculate the distribution change on other columns
    when the missing values in x are dropped.
    """
    df0 = df
    df1 = df.dropna(subset=[x])
    cols = [col for col in df.columns if col != x]

    hists = {}

    for col in cols:
        range = None  # pylint: disable=redefined-builtin
        if is_numerical(df0[col].dtype):
            range = (df0[col].min(axis=0), df0[col].max(axis=0))

        hists[col] = [
            histogram(df[col], bins=bins, return_edges=True, range=range)
            for df in [df0, df1]
        ]

    (hists,) = dd.compute(hists)

    dfs = {}
    meta = ColumnsMetadata()

    for col, hists_ in hists.items():
        counts, xs, *edges = zip(*hists_)

        labels = np.repeat(LABELS, [len(x) for x in xs])

        data = {
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": labels,
        }

        if edges:
            lower_bound: List[float] = []
            upper_bound: List[float] = []

            for edge in edges[0]:
                lower_bound.extend(edge[:-1])
                upper_bound.extend(edge[1:])

            data["lower_bound"] = lower_bound
            data["upper_bound"] = upper_bound

        df = pd.DataFrame(data)

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins and is_categorical(df0[col].dtype):
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            meta[col, "partial"] = (bins, len(counts[0]))
        else:
            meta[col, "partial"] = (len(counts[0]), len(counts[0]))
        meta[col, "dtype"] = df0[col].dtype

        dfs[col] = df

    return Intermediate(data=dfs, x=x, meta=meta, visual_type="missing_impact_1vn")
def write_to_S3(data: dd.DataFrame, bucket_name: str, folder_name: str,
                table_name: str) -> None:
    data.to_parquet(path=f"s3://{bucket_name}/{folder_name}/{table_name}",
                    compression="gzip",
                    engine="pyarrow",
                    overwrite=True)
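# Hypothetical call for write_to_S3: writes the frame as gzip-compressed parquet to
# s3://my-bucket/raw/trips. Requires s3fs and valid AWS credentials in the environment;
# the bucket/folder/table names and data below are placeholders.
import dask.dataframe as dd
import pandas as pd

trips = dd.from_pandas(pd.DataFrame({"fare": [7.0, 9.5]}), npartitions=1)
write_to_S3(trips, bucket_name="my-bucket", folder_name="raw", table_name="trips")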
def _apply_offset(self, df: dd.DataFrame, offset: int, end: int) -> dd.DataFrame:
    """
    Limit the dataframe to the window [offset, end].
    That is unfortunately not so simple, as we do not know how many
    items we have in each partition. We have therefore no other way than to
    calculate (!!!) the sizes of each partition.

    After that, we can create a new dataframe from the old
    dataframe by calculating for each partition if and how much
    it should be used.
    We do this via generating our own dask computation graph as
    we need to pass the partition number to the selection
    function, which is not possible with normal "map_partitions".
    """
    if not offset:
        # We do a (hopefully) very quick check: if the first partition
        # is already enough, we will just use this
        first_partition_length = len(df.partitions[0])
        if first_partition_length >= end:
            return df.head(end, compute=False)

    # First, we need to find out which partitions we want to use.
    # Therefore we count the total number of entries
    partition_borders = df.map_partitions(lambda x: len(x))

    # Now we let each of the partitions figure out, how much it needs to return,
    # using these partition borders.
    # For this, we generate our own dask computation graph (as it does not really
    # fit well with one of the already present methods).

    # (a) we define a method to be calculated on each partition
    # This method returns the part of the partition which falls between [offset, end].
    # Please note that the dask object "partition_borders" will be turned into
    # its pandas representation at this point and we can calculate the cumsum
    # (which is not possible on the dask object). Recalculating it should not cost
    # us much, as we assume the number of partitions is rather small.
    @dask.delayed
    def select_from_to(df, partition_index, partition_borders):
        partition_borders = partition_borders.cumsum().to_dict()
        this_partition_border_left = (
            partition_borders[partition_index - 1] if partition_index > 0 else 0
        )
        this_partition_border_right = partition_borders[partition_index]

        if (end and end < this_partition_border_left) or (
            offset and offset >= this_partition_border_right
        ):
            return df.iloc[0:0]

        from_index = max(offset - this_partition_border_left, 0) if offset else 0
        to_index = (
            min(end, this_partition_border_right)
            if end
            else this_partition_border_right
        ) - this_partition_border_left

        return df.iloc[from_index:to_index]

    # (b) Now we just need to apply the function on every partition
    # We do this via the delayed interface, which seems the easiest one.
    return dd.from_delayed(
        [
            select_from_to(partition, partition_number, partition_borders)
            for partition_number, partition in enumerate(df.partitions)
        ]
    )
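# Worked example of the partition-border arithmetic above (numbers are made up):
# partition sizes [4, 3, 5] -> cumulative borders {0: 4, 1: 7, 2: 12}. For offset=5, end=9:
#   partition 0 covers global rows [0, 4)  -> skipped entirely (offset 5 >= right border 4)
#   partition 1 covers global rows [4, 7)  -> iloc[1:3] (global rows 5 and 6)
#   partition 2 covers global rows [7, 12) -> iloc[0:2] (global rows 7 and 8)
# so exactly the rows in [offset, end) are returned across partitions.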
def _save(self, data: dd.DataFrame) -> None:
    data.to_parquet(self._filepath, storage_options=self.fs_args, **self._save_args)
def slowly_create_increasing_index(ddf: dd.DataFrame) -> dd.DataFrame:
    ddf['cs'] = 1
    ddf['cs'] = ddf.cs.cumsum()
    return ddf.set_index('cs')
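# Behaviour sketch (illustrative data): like reset_index_dask above, this builds a monotonically
# increasing index via a global cumsum, but it starts at 1, names the index "cs", and the plain
# set_index call triggers a shuffle, hence the "slowly" in the name.
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"v": list("abcd")}), npartitions=2)
print(slowly_create_increasing_index(ddf).compute())  # index "cs" runs 1..4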
def missing_impact_1vn(  # pylint: disable=too-many-locals
    df: dd.DataFrame,
    x: str,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    Calculate the distribution change on other columns
    when the missing values in x are dropped.
    """
    df0 = df
    df1 = df.dropna(subset=[x])
    cols = [col for col in df.columns if col != x]

    hists = {}
    hists_restore_dtype = {}

    for col in cols:
        range = None  # pylint: disable=redefined-builtin
        if is_dtype(detect_dtype(df0[col], dtype), Continuous()):
            range = (df0[col].min(axis=0), df0[col].max(axis=0))

        hists[col] = [
            histogram(df[col], dtype=dtype, bins=bins, return_edges=True, range=range)
            for df in [df0, df1]
        ]
        # In some cases (Issue #98), dd.compute() can change the features' dtypes and cause errors.
        # So we need to restore the features' dtypes after dd.compute().
        centers_dtypes = (hists[col][0][1].dtype, hists[col][1][1].dtype)
        (hists,) = dd.compute(hists)
        dict_value = []
        # Here we do not reassign to the "hists" variable as
        # dd.compute() can change variables' types and cause errors in the mypy test in CircleCI.
        # Instead, we assign to a new variable hists_restore_dtype.
        for i in [0, 1]:
            intermediate = list(hists[col][i])
            intermediate[1] = intermediate[1].astype(centers_dtypes[i])
            dict_value.append(tuple(intermediate))
        hists_restore_dtype[col] = dict_value

    dfs = {}
    meta = ColumnsMetadata()

    for col, hists_ in hists_restore_dtype.items():
        counts, xs, *edges = zip(*hists_)

        labels = np.repeat(LABELS, [len(x) for x in xs])

        data = {
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": labels,
        }

        if edges:
            lower_bound: List[float] = []
            upper_bound: List[float] = []

            for edge in edges[0]:
                lower_bound.extend(edge[:-1])
                upper_bound.extend(edge[1:])

            data["lower_bound"] = lower_bound
            data["upper_bound"] = upper_bound

        df = pd.DataFrame(data)

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins and is_dtype(detect_dtype(df0[col], dtype), Nominal()):
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            meta[col, "partial"] = (bins, len(counts[0]))
        else:
            meta[col, "partial"] = (len(counts[0]), len(counts[0]))
        meta[col, "dtype"] = detect_dtype(df0[col], dtype)

        dfs[col] = df

    return Intermediate(data=dfs, x=x, meta=meta, visual_type="missing_impact_1vn")
def compute_bivariate(
    df: dd.DataFrame,
    x: str,
    y: str,
    bins: int,
    ngroups: int,
    largest: bool,
    nsubgroups: int,
    timeunit: str,
    agg: str,
    sample_size: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """Compute functions for plot(df, x, y).

    Parameters
    ----------
    df
        Dataframe from which plots are to be generated
    x
        A valid column name from the dataframe
    y
        A valid column name from the dataframe
    bins
        For a histogram or box plot with numerical x axis, it defines
        the number of equal-width bins to use when grouping.
    ngroups
        When grouping over a categorical column, it defines the number of
        groups to show in the plot. Ie, the number of bars to show in a bar chart.
    largest
        If true, when grouping over a categorical column, the groups with the largest
        count will be output. If false, the groups with the smallest count will be output.
    nsubgroups
        If x and y are categorical columns, ngroups refers to how many groups to show
        from column x, and nsubgroups refers to how many subgroups to show from column y
        in each group in column x.
    timeunit
        Defines the time unit to group values over for a datetime column.
        It can be "year", "quarter", "month", "week", "day", "hour", "minute", "second".
        With default value "auto", it will use the time unit such that the resulting
        number of groups is closest to 15.
    agg
        Specify the aggregate to use when aggregating over a numeric column
    sample_size
        Sample size for the scatter plot
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g. dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()
    """
    # pylint: disable=too-many-arguments,too-many-locals

    xtype = detect_dtype(df[x], dtype)
    ytype = detect_dtype(df[y], dtype)

    if (
        is_dtype(xtype, Nominal())
        and is_dtype(ytype, Continuous())
        or is_dtype(xtype, Continuous())
        and is_dtype(ytype, Nominal())
    ):
        x, y = (x, y) if is_dtype(xtype, Nominal()) else (y, x)
        df = df[[x, y]]
        first_rows = df.head()
        try:
            first_rows[x].apply(hash)
        except TypeError:
            df[x] = df[x].astype(str)
        (comps,) = dask.compute(nom_cont_comps(df.dropna(), bins, ngroups, largest))
        return Intermediate(
            x=x, y=y, data=comps, ngroups=ngroups, visual_type="cat_and_num_cols"
        )
    elif (
        is_dtype(xtype, DateTime())
        and is_dtype(ytype, Continuous())
        or is_dtype(xtype, Continuous())
        and is_dtype(ytype, DateTime())
    ):
        x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
        df = df[[x, y]].dropna()
        dtnum: List[Any] = []
        # line chart
        dtnum.append(dask.delayed(_calc_line_dt)(df, timeunit, agg))
        # box plot
        dtnum.append(dask.delayed(calc_box_dt)(df, timeunit))
        dtnum = dask.compute(*dtnum)
        return Intermediate(
            x=x,
            y=y,
            linedata=dtnum[0],
            boxdata=dtnum[1],
            visual_type="dt_and_num_cols",
        )
    elif (
        is_dtype(xtype, DateTime())
        and is_dtype(ytype, Nominal())
        or is_dtype(xtype, Nominal())
        and is_dtype(ytype, DateTime())
    ):
        x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
        df = df[[x, y]].dropna()
        df[y] = df[y].apply(str, meta=(y, str))
        dtcat: List[Any] = []
        # line chart
        dtcat.append(
            dask.delayed(_calc_line_dt)(df, timeunit, ngroups=ngroups, largest=largest)
        )
        # stacked bar chart
        dtcat.append(dask.delayed(calc_stacked_dt)(df, timeunit, ngroups, largest))
        dtcat = dask.compute(*dtcat)
        return Intermediate(
            x=x,
            y=y,
            linedata=dtcat[0],
            stackdata=dtcat[1],
            visual_type="dt_and_cat_cols",
        )
    elif is_dtype(xtype, Nominal()) and is_dtype(ytype, Nominal()):
        df = df[[x, y]]
        first_rows = df.head()
        try:
            first_rows[x].apply(hash)
        except TypeError:
            df[x] = df[x].astype(str)
        try:
            first_rows[y].apply(hash)
        except TypeError:
            df[y] = df[y].astype(str)
        (comps,) = dask.compute(df.dropna().groupby([x, y]).size())
        return Intermediate(
            x=x,
            y=y,
            data=comps,
            ngroups=ngroups,
            nsubgroups=nsubgroups,
            visual_type="two_cat_cols",
        )
    elif is_dtype(xtype, Continuous()) and is_dtype(ytype, Continuous()):
        df = df[[x, y]].dropna()
        data: Dict[str, Any] = {}
        # scatter plot data
        data["scat"] = df.map_partitions(
            lambda x: x.sample(min(100, x.shape[0])), meta=df
        )
        # hexbin plot data
        data["hex"] = df
        # box plot
        data["box"] = calc_box_num(df, bins)
        (data,) = dask.compute(data)
        return Intermediate(
            x=x,
            y=y,
            data=data,
            spl_sz=sample_size,
            visual_type="two_num_cols",
        )
    else:
        raise UnreachableError
def calc_line_dt(
    df: dd.DataFrame,
    unit: str,
    agg: Optional[str] = None,
    ngroups: Optional[int] = None,
    largest: Optional[bool] = None,
) -> Union[
    Tuple[pd.DataFrame, Dict[str, int], str],
    Tuple[pd.DataFrame, str, float],
    Tuple[pd.DataFrame, str],
]:
    """
    Calculate a line or multiline chart with date on the x axis. If df contains
    one datetime column, it will make a line chart of the frequency of values.
    If df contains a datetime and categorical column, it will compute the frequency
    of each categorical value in each time group. If df contains a datetime and
    numerical column, it will compute the aggregate of the numerical column grouped
    by the time groups. If df contains a datetime, categorical, and numerical column,
    it will compute the aggregate of the numerical column for values in the categorical
    column grouped by time.

    Parameters
    ----------
    df
        A dataframe
    unit
        The unit of time over which to group the values
    agg
        Aggregate to use for the numerical column
    ngroups
        Number of groups for the categorical column
    largest
        Use the largest or smallest groups in the categorical column
    """
    # pylint: disable=too-many-locals

    x = df.columns[0]  # time column
    unit = _get_timeunit(df[x].min(), df[x].max(), 100) if unit == "auto" else unit
    if unit not in DTMAP.keys():
        raise ValueError
    grouper = pd.Grouper(key=x, freq=DTMAP[unit][0])  # for grouping the time values

    # multiline charts
    if ngroups and largest:
        hist_dict: Dict[str, Tuple[np.ndarray, np.ndarray, List[str]]] = dict()
        hist_lst: List[Tuple[np.ndarray, np.ndarray, List[str]]] = list()
        agg = (
            "freq" if agg is None else agg
        )  # default agg if unspecified for notational concision

        # categorical column for grouping over, each resulting group is a line in the chart
        grpby_col = df.columns[1] if len(df.columns) == 2 else df.columns[2]
        df, grp_cnt_stats, largest_grps = _calc_groups(df, grpby_col, ngroups, largest)
        groups = df.groupby([grpby_col])

        for grp in largest_grps:
            srs = groups.get_group(grp)
            # calculate the frequencies or aggregate value in each time group
            if len(df.columns) == 3:
                dfr = srs.groupby(grouper)[df.columns[1]].agg(agg).reset_index()
            else:
                dfr = srs[x].to_frame().groupby(grouper).size().reset_index()
            dfr.columns = [x, agg]
            # if grouping by week, make the label for the week the beginning Sunday
            dfr[x] = dfr[x] - pd.to_timedelta(6, unit="d") if unit == "week" else dfr[x]
            # format the label
            dfr["lbl"] = dfr[x].dt.to_period("S").dt.strftime(DTMAP[unit][1])
            hist_lst.append((list(dfr[agg]), list(dfr[x]), list(dfr["lbl"])))

        hist_lst = dask.compute(*hist_lst)

        for elem in zip(largest_grps, hist_lst):
            hist_dict[elem[0]] = elem[1]

        return hist_dict, grp_cnt_stats, DTMAP[unit][3]

    # single line charts
    if agg is None:  # frequency of datetime column
        miss_pct = round(df[x].isna().sum() / len(df) * 100, 1)
        dfr = df.dropna().groupby(grouper).size().reset_index()
        dfr.columns = [x, "freq"]
        dfr["pct"] = dfr["freq"] / len(df) * 100
    else:  # aggregate over a second column
        dfr = df.groupby(grouper)[df.columns[1]].agg(agg).reset_index()
        dfr.columns = [x, agg]

    dfr[x] = dfr[x] - pd.to_timedelta(6, unit="d") if unit == "week" else dfr[x]
    dfr["lbl"] = dfr[x].dt.to_period("S").dt.strftime(DTMAP[unit][1])

    return (dfr, DTMAP[unit][3], miss_pct) if agg is None else (dfr, DTMAP[unit][3])
def dump_dask_to_intake(dd: DataFrame, data_name: str, data_dir: Union[str, Path],
                        catalog_file: Union[Path, str], **kwargs):
    """
    API to dump dask dataframe as parquet format and add it to intake catalog

    Args:
        dd: dask dataframe to dump.
        data_name: name used as a name of intake data source.
        data_dir: directory where dask dataframe will be stored.
        catalog_file: file where data source to be added.
            if file doesn't exist, file will be created.
        kwargs: Any options available for dask.dataframe.to_parquet.
            see https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet
            for detail.

    Returns:
        created parquet data source and dask to_parquet job
        (if you put compute=False in kwargs.)

    Examples:
        >>> import os
        >>> import shutil
        >>> import yaml
        >>> from intake.source.csv import CSVSource
        >>> import dask.dataframe
        >>> import pandas
        >>> pd = pandas.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [5, 6, 7, 8],
        ...                        'label': ['a', 'b', 'c', 'd']},
        ...                       index=[100, 200, 300, 400])
        >>> print(pd)
             a  b  c label
        100  1  2  5     a
        200  2  3  6     b
        300  3  4  7     c
        400  4  5  8     d
        >>> dd = dask.dataframe.from_pandas(pd, npartitions=2)
        >>> print(dd)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                           a      b      c   label
        npartitions=2
        100            int64  int64  int64  object
        300              ...    ...    ...     ...
        400              ...    ...    ...     ...
        Dask Name: from_pandas, 2 tasks
        >>> cfile = 'test/temp/test-catalog.yaml'
        >>> ddir = 'test/temp/data-dir'
        >>> # DUMP WITH COMPUTATION
        >>> psource1, job = dump_dask_to_intake(dd, 'test-dd1', ddir, cfile)
        >>> print(psource1.name)
        test-dd1
        >>> print(psource1.read())
        ... # doctest: +NORMALIZE_WHITESPACE
             a  b  c label
        100  1  2  5     a
        200  2  3  6     b
        300  3  4  7     c
        400  4  5  8     d
        >>> print(job is None)
        True
        >>> print(yaml.safe_load(Path(cfile).open().read()))
        ... # doctest: +NORMALIZE_WHITESPACE
        {'metadata': {},
         'sources': {'test-dd1': {'args': {'urlpath': 'test/temp/data-dir/test-dd1'},
                                  'description': '',
                                  'driver': 'intake_parquet.source.ParquetSource',
                                  'metadata': {}}}}
        >>> # DUMP WITHOUT COMPUTATION
        >>> psource2, job = dump_dask_to_intake(dd, 'test-dd2', ddir, cfile, compute=False)
        >>> print(job is None)
        False
        >>> # do computation lazily
        >>> job.compute()
        >>> print(psource2.read())
        ... # doctest: +NORMALIZE_WHITESPACE
             a  b  c label
        100  1  2  5     a
        200  2  3  6     b
        300  3  4  7     c
        400  4  5  8     d
        >>> os.remove(cfile)
        >>> shutil.rmtree(ddir)
    """
    data_dir = local_or_s3_path(data_dir)
    parquet_dir = data_dir / data_name
    parquet_kwargs = {}
    if 'engine' in kwargs:
        parquet_kwargs['engine'] = kwargs['engine']
    psource = ParquetSource(str(parquet_dir), **parquet_kwargs)
    psource.name = data_name
    add_source_to_catalog(psource, catalog_file)
    to_parquet_result = dd.to_parquet(str(parquet_dir), **kwargs)
    return psource, to_parquet_result