def rolling_mean_by_date_by_group(data: dd = None, groupby_columns: List[str] = None,
                                  metric_columns: List[str] = None, date_column: str = None,
                                  window: int = None) -> dd:
    """
    Split input dataframe into groups and perform a rolling average on the metric columns for each group
    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate rolling average on
    :param date_column: name of date column
    :param window: window size to be used on rolling average
    :return: modified dask dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_rolling_mean'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: rolling_mean_by_date(
            data=df_g, metric_columns=metric_columns, window=window),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
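# A minimal usage sketch (not from the original module): build a toy dask dataframe
# and compute a 2-row rolling mean per store. It assumes the rolling_mean_by_date
# helper referenced above is defined elsewhere in this module.
import pandas as pd
import dask.dataframe as dd

example_pdf = pd.DataFrame({
    "date": pd.date_range("2021-01-01", periods=6),
    "store": ["a"] * 3 + ["b"] * 3,
    "sales": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
})
example_ddf = dd.from_pandas(example_pdf, npartitions=1)
example_result = rolling_mean_by_date_by_group(
    data=example_ddf, groupby_columns=["store"], metric_columns=["sales"],
    date_column="date", window=2).compute()
# example_result gains a 'sales_rolling_mean' column computed within each store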
def fill_missing_dates_by_group(data: dd = None, groupby_columns: List[str] = None,
                                fill_method: str = None, date_range: Tuple[str] = None,
                                date_column: str = None, fill_value=None) -> dd:
    """
    Split input dataframe into groups according to groupby columns and reindex with continuous dates
    within the specified date range. Fill missing values according to fill method
    :param data: dataframe
    :param groupby_columns: list of columns to group by
    :param fill_method: method used to fill missing data
    :param date_range: date range to reindex to
    :param date_column: name of date column
    :param fill_value: value used to fill missing data
    :return: modified dataframe
    """
    output_schema = dict(data.dtypes)
    output_schema = list(output_schema.items())
    columns = data.columns
    data = data.set_index(date_column, sorted=True)
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: fill_missing_dates(data=df_g,
                                        date_column=date_column,
                                        fill_method=fill_method,
                                        columns=columns,
                                        date_range=date_range,
                                        fill_value=fill_value,
                                        groupby_columns=groupby_columns),
        meta=output_schema).reset_index(drop=True)
    return data
def fuzzy_join(orders: dd, price_over_time: dd, on: str) -> dd:
    """Join each order to the price snapshot nearest to it in time and derive its price
    relative to the most recent trade price."""
    orders.loc[:, 'price'] = pd.to_numeric(orders['price'])
    orders.loc[:, 'time'] = pd.to_datetime(orders['time'])
    price_over_time = price_over_time.reindex(orders['time'].unique(), method='nearest')
    joined = orders.join(price_over_time, on=on).fillna(method='ffill')
    joined['relative_price'] = joined.apply(
        lambda row: float(row['price']) - float(row['most_recent_trade_price']), axis=1)
    return joined
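# A small pandas sketch (not from the original module) of the nearest-time reindex
# used above, with hypothetical timestamps and prices.
import pandas as pd

example_prices = pd.DataFrame(
    {"most_recent_trade_price": [100.0, 101.0]},
    index=pd.to_datetime(["2021-01-01 00:00:00", "2021-01-01 00:00:10"]))
example_order_times = pd.to_datetime(["2021-01-01 00:00:02", "2021-01-01 00:00:09"])
aligned = example_prices.reindex(example_order_times, method="nearest")
# 00:00:02 maps to 100.0 (nearest snapshot is 00:00:00); 00:00:09 maps to 101.0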
def remove_tails(self, data: dd, std_devs: int, sample_size: int = 10000):
    data = DataUtils().keep_n_std_dev(data, std_devs)
    if len(data) > sample_size:
        data = data.sample(n=sample_size)
    data = DataUtils().keep_n_std_dev(data, std_devs)
    return data
def fit(self, X: dd, y=None):
    """ Calculate what columns should be removed, based on the defined thresholds

    Args:
        X (dd): Dataframe to be processed
        y (dd, optional): Target. Defaults to None.

    Returns:
        self
    """
    # Calculate number of missing rows in each column
    summary_df = X.isnull().sum().compute()
    summary_df = summary_df.to_frame(name="nulls_count")
    summary_df["nulls_proportions"] = summary_df["nulls_count"] / X.shape[0].compute()
    summary_df.sort_values(by="nulls_count", ascending=False, inplace=True)

    # Select what columns should be removed, based on proportions
    mask_nulls = summary_df["nulls_proportions"] > self.nulls_threshold
    summary_df.loc[mask_nulls, "filtered_nulls"] = 1
    summary_df.loc[~mask_nulls, "filtered_nulls"] = 0

    self.feature_names = list(summary_df[mask_nulls].index.values)

    return self
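# A standalone sketch (not from the original class) of the null-proportion rule
# computed in fit(), with a hypothetical 0.5 threshold.
import pandas as pd
import dask.dataframe as dd

X_example = dd.from_pandas(
    pd.DataFrame({"a": [1.0, None, None, None],   # 75% nulls
                  "b": [1.0, 2.0, 3.0, None]}),   # 25% nulls
    npartitions=2)

nulls_threshold = 0.5
proportions = X_example.isnull().sum().compute() / X_example.shape[0].compute()
to_drop = list(proportions[proportions > nulls_threshold].index)
print(to_drop)  # ['a']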
def make_filter_std_pipeline(data: dd, numerical_columns: list[str] or bool = True,
                             thresholds: list[float] = None, inclusive: bool = False):
    # TODO: write unit tests
    """ Makes pipeline to filter columns according to standard deviation

    Args:
        data (dd): Data frame to be filtered
        numerical_columns (list or bool, optional): Columns to subset the filtering. Defaults to True.
        thresholds (list, optional): Interval of std values to filter. Defaults to None.
        inclusive (bool, optional): Whether to include the interval boundaries. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter data frame
    """
    selected_columns = data.select_dtypes(
        include=[np.number]).columns.values if isinstance(
            numerical_columns, bool) else numerical_columns
    steps = [("extract", Extract(selected_columns)),
             ("std_filter", Filter_Std(std_thresholds=thresholds, inclusive=inclusive))]
    return EPipeline(steps)
def drop_rows_with_any_null_values(data: dd = None) -> dd:
    """
    Drop any rows containing null values from the input dataframe
    :param data: dask dataframe
    :return: modified dask dataframe
    """
    return data.dropna()
def fit(self, X: dd, y=None):
    """Calculate what columns should be removed, based on the defined thresholds

    Args:
        X (dd): Dataframe to be processed
        y (dd, optional): Target. Defaults to None.

    Returns:
        self
    """
    subset = X.select_dtypes(exclude=[np.number, "datetime64[ns]"])

    # Calculate the entropy column-wise
    entropies_df = subset.compute().apply(entropy, axis=0).to_frame(name="entropy")
    entropies_df.reset_index(inplace=True)
    entropies_df.rename(columns={"index": "column_name"}, inplace=True)
    entropies_df.sort_values(by="entropy", inplace=True, ascending=False)

    # Get thresholds and calculate what columns will be removed
    thresholds = [float(value) for value in self.entropy_thresholds]
    mask_entropy = entropies_df["entropy"].between(
        min(thresholds), max(thresholds), inclusive=self.inclusive)

    # Get list of columns to be removed
    self.feature_names = list(entropies_df.loc[~mask_entropy, "column_name"].values)
    mask_removed = entropies_df["column_name"].isin(self.feature_names)
    entropies_df.loc[mask_removed, "filtered_entropy"] = 1

    return self
def get_orderbook(feed_df: pd.DataFrame, ob_state: dd, ob_state_seq) -> dd:
    """Gets those orders which are still active at the end of the feed"""
    # Find those orders which are no longer on the book
    # TODO: find those orders which were modified, handle carefully
    open_messages = feed_df[feed_df['type'] == 'open'].copy()
    open_messages['size'] = open_messages['remaining_size']
    residual_orders = open_messages[open_messages['sequence'] > ob_state_seq]
    all_orders = ob_state.append(residual_orders)

    done_messages = feed_df[feed_df['type'] == 'done']
    done_order_ids = list(done_messages['order_id'])

    # Find those orders which are still on the book
    ob_filtered = all_orders[~all_orders['order_id'].isin(done_order_ids)]

    # This variable is used in the pandas query below
    # final_trade_price = trades['price'].dropna().iloc[-1]
    # ob_final = DataSplitter.get_side("buy", ob_filtered).query('price < @final_trade_price').append(
    #     DataSplitter.get_side("sell", ob_filtered).query('price > @final_trade_price')
    # )

    if not OrderBookCreator.check_ob_valid(ob_filtered):
        raise AssertionError("OrderBook does not appear to be valid")

    final_seq = ob_filtered['sequence'].sort_values().iloc[-1]
    return ob_filtered.reset_index(drop=True)[['side', 'order_id', 'price', 'size']], final_seq
def encode_dataset_into_binning_indices(dd: dict, data: df, bn_attrs: list, cat_attrs: list):
    """Before constructing Bayesian network, encode input dataset into binning indices."""
    data_encoded = data.to_delayed()
    data_encoded = [
        dask.delayed(attributes.encode_chunk_into_binning_indices)(
            chunk, bn_attrs, cat_attrs, dd['distribution']['bins'])
        for chunk in data_encoded
    ]
    return data_encoded
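# A self-contained sketch (not from the original module) of the delayed-chunk
# encoding pattern above, with a hypothetical encode_chunk standing in for
# attributes.encode_chunk_into_binning_indices.
import dask
import dask.dataframe as dd_
import pandas as pd

def encode_chunk(chunk: pd.DataFrame, bins):
    # Replace each value with the index of the bin it falls into
    return chunk.apply(lambda col: pd.cut(col, bins=bins, labels=False))

ages = dd_.from_pandas(pd.DataFrame({"age": [23, 35, 58, 41]}), npartitions=2)
delayed_chunks = ages.to_delayed()
encoded = [dask.delayed(encode_chunk)(chunk, [0, 30, 50, 100]) for chunk in delayed_chunks]
encoded = dask.compute(*encoded)  # tuple of encoded pandas chunks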
def agg_insert_by_group(data: dd = None, groupby_columns: List[str] = None,
                        agg_dict: dict = None, insert_dict: dict = None) -> dd:
    """
    Split input dataframe into groups, apply aggregations on each group according to the aggregation
    dict, and insert the aggregated results back into the original dataframe with column values
    specified in the insert dict
    :param data: input dask dataframe
    :param groupby_columns: list of column names to group by
    :param agg_dict: dictionary of the format {column name: aggregation to perform on column name}
    :param insert_dict: dictionary of the format {column name: value of column to be set prior to insertion}
    :return: modified dataframe
    """
    agg_data = data.groupby(groupby_columns).agg(agg_dict).reset_index()
    agg_data.columns = agg_data.columns.droplevel(1)
    for column, value in insert_dict.items():
        agg_data[column] = value
    data = data.append(agg_data)
    return data
def aggr_by_year_journal(df: dask.dataframe) -> dask.dataframe:
    """Aggregate issue count by year and newspaper.

    :param dask.dataframe df: Dataframe comprising all issues.
    :return: Dataframe grouped by journal and year.
    :rtype: dask.dataframe
    """
    return df.groupby(['journal', 'year']).count()
def drop_duplicate_rows(data: dd = None, subset: List[str] = None, keep: str = None) -> dd:
    """
    Drop rows containing duplicate data for the specified subset of columns
    :param data: dask dataframe
    :param subset: list of column names
    :param keep: which duplicate to keep
    :return: modified dask dataframe
    """
    return data.drop_duplicates(subset=subset, keep=keep)
def transform(self, X: dd, y=None):
    """ Remove duplicated rows

    Args:
        X (dd): Dataframe to be processed
        y (dd, optional): Target. Defaults to None.

    Returns:
        (dd): Dataframe with rows removed
    """
    return X.drop_duplicates(subset=self.subset)
def transform(self, X: dd, y=None):
    """ Remove columns computed in fit method

    Args:
        X (dd): Dataframe to be processed
        y (dd, optional): Target. Defaults to None.

    Returns:
        (dd): Dataframe with columns removed
    """
    return X.drop(labels=self.feature_names, axis=1)
def remove_outliers(self, data: dataframe, threshold: float):
    # Materialise the frame so the column-wise stats below are cheap to compute
    data = data.compute(num_workers=self.workers)
    stats: dict = {
        "mean": data[self.cols["CONTINUOUS"]].mean(axis=0),
        "std_dev": data[self.cols["CONTINUOUS"]].std(axis=0)
    }
    # Add a z-score column for every continuous column
    z_cols = list(map(lambda col: "z" + col, self.cols["CONTINUOUS"]))
    zdata = data[self.cols["CONTINUOUS"]].apply(
        lambda col: (col - stats["mean"][col.name]) / (stats["std_dev"][col.name]),
        axis=0)
    zdata.columns = z_cols
    data = concat([data, zdata], axis=1)
    # Keep only rows whose z-scores fall within +/- threshold
    for z_col in z_cols:
        data = data[data[z_col].between(-1 * threshold, threshold)]
    return dataframe.from_pandas(
        data.drop(columns=z_cols).reset_index(drop=True),
        npartitions=self.workers)
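# A minimal pandas sketch (not from the original class) of the z-score filter
# applied above, using hypothetical values and a +/- 2 std-dev threshold.
import pandas as pd

example = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0, 5.0, 100.0]})
z = (example["x"] - example["x"].mean()) / example["x"].std()
kept = example[z.between(-2, 2)]
print(kept)  # the 100.0 row is dropped; its z-score is about 2.04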
def yoy_percent_change_by_group(data: dd = None, groupby_columns: List[str] = None,
                                metric_columns: List[str] = None, date_column: str = None) -> dd:
    """
    Split dataframe into groups and calculate year over year percent change for the metric columns
    in each group
    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate year over year percent change on
    :param date_column: name of date column
    :return: modified dataframe
    """
    data = data.set_index(date_column, sorted=True)
    output_schema = dict(data.dtypes)
    for metric_column in metric_columns:
        output_schema[f'{metric_column}_yoy_pct_change'] = 'float32'
    output_schema = list(output_schema.items())
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: yoy_percent_change(data=df_g, metric_columns=metric_columns),
        meta=output_schema)
    data = data.reset_index().rename(columns={'index': date_column})
    return data
def make_filter_entropy_pipeline(data: dd, categorical_columns: list[str] or bool = True,
                                 thresholds: list[float] = None, inclusive: bool = False):
    # TODO: write unit tests
    """ Makes pipeline to filter categorical columns according to entropy

    Args:
        data (dd): Data frame to be filtered
        categorical_columns (list or bool, optional): Columns to subset the filtering. Defaults to True.
        thresholds (list, optional): Interval of entropy values to filter. Defaults to None.
        inclusive (bool, optional): Whether to include the interval boundaries. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter data frame
    """
    selected_columns = data.select_dtypes(
        exclude=[np.number], include=["object"]).columns.values if isinstance(
            categorical_columns, bool) else categorical_columns
    steps = [("extract", Extract(selected_columns)),
             ("entropy_filter", Filter_Entropy(entropy_thresholds=thresholds, inclusive=inclusive))]
    return EPipeline(steps)
def date_continuity_check_by_group(data: dd = None, groupby_columns: List[str] = None,
                                   date_column: str = None) -> bool:
    """
    Split data into groups and evaluate each group, checking if it contains a set of continuous dates
    in its date column. If any group contains a discontinuity return True, else return False
    :param data: dask dataframe
    :param groupby_columns: column names to group by
    :param date_column: date column name
    :return: boolean
    """
    output_schema = [(date_column, data[date_column].dtype)]
    output_schema.append(('date_continuity_bool', 'bool'))
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_continuity_check(data=df_g, date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_continuity_bool'].compute().any()
def date_range_check_by_group(data: dd = None, groupby_columns: List[str] = None,
                              date_range: Tuple[str] = None, date_column: str = None) -> bool:
    """
    Split input dataframe by group and check if the min and max date of each group fall outside of
    the specified date range
    :param data: dask dataframe
    :param groupby_columns: list of column names to group by
    :param date_range: tuple defining required date range
    :param date_column: name of date column
    :return: bool
    """
    output_schema = ('date_range_bool', 'bool')
    data = data.groupby(by=groupby_columns).apply(
        lambda df_g: date_range_check(data=df_g, date_range=date_range, date_column=date_column),
        meta=output_schema).reset_index()
    return data['date_range_bool'].compute().any()
def impute_nulls(self, data: dataframe):
    # Impute by mean
    data[self.cols["CONTINUOUS"]] = data[self.cols["CONTINUOUS"]].fillna(
        data[self.cols["CONTINUOUS"]].mean(
            axis=0, skipna=True).compute(num_workers=self.workers),
        axis=0)
    # Impute by mode
    cat_cols: list = self.cols["CATEGORICAL"]["STRING"] + \
        self.cols["CATEGORICAL"]["NUMERIC"]
    col_modes = data[cat_cols].mode(dropna=True).compute(num_workers=self.workers)
    for col in cat_cols:
        data[col] = data[col].fillna(col_modes[col].iloc[0], axis=0)
    data = data.dropna(how="any")
    return data
def write_data_by_file_extension(data: dd = None, file_path: Path = None):
    """
    Write dask dataframe to file based on the input path's file extension
    :param data: dask dataframe
    :param file_path: path of output file
    :return: None
    """
    data = data.compute()
    map_file_extension_to_write_function = {
        '.csv': 'to_csv',
        '.parquet': 'to_parquet'
    }
    name, extension = os.path.splitext(file_path)
    if extension.lower() in map_file_extension_to_write_function:
        write_function = getattr(
            data, map_file_extension_to_write_function[extension.lower()])
        write_function(file_path, index=False)
    else:
        raise Exception(f"File extension {extension} not recognized")
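# A hedged usage sketch with a hypothetical output path; the extension of the
# path selects the pandas writer.
from pathlib import Path
import pandas as pd
import dask.dataframe as dd

small_ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)
write_data_by_file_extension(data=small_ddf, file_path=Path("example_output.csv"))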
def fill_missing_dates(data: dd = None, date_column: str = None, fill_method: str = None,
                       columns=None, date_range: Tuple[str] = None, fill_value=None,
                       groupby_columns=None) -> dd:
    """
    Perform date fill on a single group
    """
    all_dates = pd.date_range(date_range[0], date_range[1])
    metric_data = data[[col for col in data.columns if col not in groupby_columns]]
    data = data[groupby_columns].reindex(all_dates, method='nearest')
    metric_data = metric_data.reindex(all_dates, method=fill_method, fill_value=fill_value)
    data = dd.merge(data, metric_data, left_index=True, right_index=True)
    data = data.reset_index().rename(columns={'index': date_column})[columns]
    return data
def upload_parquet_file_to_es_idx(es: Elasticsearch, es_idx: str, papers_dd: dd,
                                  partition_num: int) -> None:
    try:
        start = time.time()
        papers_dd_partition = papers_dd.get_partition(partition_num)
        papers_df_partition = papers_dd_partition.compute()
        compute_end = time.time()
        print(f"Partition #{partition_num} compute time: {compute_end - start}")
        print(f"Papers partition memory size: {papers_df_partition.memory_usage(deep=True).sum()}")
        print(f"Number of papers in partition: {papers_df_partition.shape[0]}")

        r = es.bulk(rec_to_actions(papers_df_partition, es_idx))
        upload_end = time.time()
        print(f"Partition #{partition_num} upload time: {upload_end - compute_end}")
        print(f"Errors in uploading partition #{partition_num}: {r['errors']}\n\n")
    except TransportError as te:
        transport_error_413_url = "https://github.com/elastic/elasticsearch/issues/2902"
        transport_error_429_urls = [
            "https://stackoverflow.com/questions/61870751/circuit-breaking-exception-parent-data-too-large-data-for-http-request",
            "https://github.com/elastic/elasticsearch/issues/31197",
        ]
        if te.status_code == 413:
            print(
                f"Transport error with status code 413. Chunk size is too large, so try reducing the chunk size constant or increasing http.max_content_length in the yml file. More info here: {transport_error_413_url}"
            )
        elif te.status_code == 429:
            print(
                f"Transport error with status code 429. Elasticsearch's JVM heap size is too small, so try increasing the ES_HEAP_SIZE env var in docker-compose.yml. More info here: {transport_error_429_urls}"
            )
        else:
            # Could be ConnectionTimeout in connecting to index
            print(f"Error stacktrace: {te.error, te.info}")
        raise te
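# A hedged usage sketch (not from the original module): upload every partition in
# turn. It assumes an Elasticsearch client `es`, a dask dataframe `papers_dd`, and
# a hypothetical index name "papers" created elsewhere.
for partition_num in range(papers_dd.npartitions):
    upload_parquet_file_to_es_idx(es, "papers", papers_dd, partition_num)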
def apply(self, df: dd, scheduler: Scheduler = "processes") -> np.ndarray:
    """Label Dask DataFrame of data points with LFs.

    Parameters
    ----------
    df
        Dask DataFrame containing data points to be labeled by LFs
    scheduler
        A Dask scheduling configuration: either a string option or a ``Client``.
        For more information, see https://docs.dask.org/en/stable/scheduling.html#

    Returns
    -------
    np.ndarray
        Matrix of labels emitted by LFs
    """
    apply_fn = partial(apply_lfs_to_data_point, lfs=self._lfs)
    map_fn = df.map_partitions(lambda p_df: p_df.apply(apply_fn, axis=1))
    labels = map_fn.compute(scheduler=scheduler)
    labels_with_index = rows_to_triplets(labels)
    return self._numpy_from_row_data(labels_with_index)
def fit(self, X: dd, y=None):
    """Calculate what columns should be removed, based on the defined thresholds

    Args:
        X (dd): Dataframe to be processed
        y (dd, optional): Target. Defaults to None.

    Returns:
        self
    """
    subset = X.select_dtypes(include=[np.number])

    # Calculate the standard deviation column-wise
    stds = np.nanstd(subset, axis=0)
    stds_df = pd.DataFrame.from_dict({
        "column_name": subset.columns.values,
        "std": stds
    })
    stds_df.sort_values(by="std", inplace=True, ascending=False)

    # Get thresholds and calculate what columns will be removed
    thresholds = [float(value) for value in self.std_thresholds]
    mask_variance = stds_df["std"].between(min(thresholds), max(thresholds),
                                           inclusive=self.inclusive)

    # Get list of columns to be removed
    self.feature_names = list(stds_df.loc[~mask_variance, "column_name"].values)
    mask_removed = stds_df["column_name"].isin(self.feature_names)
    stds_df.loc[mask_removed, "filtered_variance"] = 1
    stds_df.loc[~mask_removed, "filtered_variance"] = 0

    return self
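# A standalone sketch (not from the original class) of the std-threshold rule
# computed in fit(), with hypothetical columns and a [0.5, 10.0] interval.
import numpy as np
import pandas as pd

example_subset = pd.DataFrame({"flat": [1.0, 1.0, 1.0], "noisy": [1.0, 5.0, 9.0]})
stds_example = pd.DataFrame({"column_name": example_subset.columns.values,
                             "std": np.nanstd(example_subset, axis=0)})
example_thresholds = [0.5, 10.0]
mask = stds_example["std"].between(min(example_thresholds), max(example_thresholds),
                                   inclusive="both")
to_remove = list(stds_example.loc[~mask, "column_name"])
print(to_remove)  # ['flat'] - zero variance, outside the kept interval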
def fill_in_missing_data(dask_df: dd) -> dd:
    return dask_df.fillna("")
def remove_papers_with_null_cols(dask_df: dd, cols: List[str]) -> dd:
    return dask_df.dropna(subset=cols, how="all")
def gather_papers_data(metadata_dd: dd) -> dd:
    return metadata_dd.map_partitions(lambda df: df.assign(
        body=retrieve_paper_body_text_for_series(df.pdf_json_files)))
def preprocess_dataset(ddf: dataframe) -> dataframe:
    """Preprocesses a dataFrame:
        - constant missing value replacement
        - lower case
        - strip accentuated characters
        - extract year from the title and simplify it to avoid redundancy
        - stop words removal and stemming

    Parameters
    ----------
    ddf: dataframe
        the dataframe to be processed.

    Returns
    -------
    dataframe
    """
    text_cols = [
        'country', 'designation', 'province', 'region_1', 'region_2',
        'taster_name', 'variety', 'winery'
    ]
    # Fill, lower-case and strip accents from each plain text column
    for col in text_cols:
        ddf = ddf.map_partitions(lambda d, c=col: d.assign(
            **{c: d[c].fillna("_missing_").str.lower().apply(unidecode)}))

    # Get year from the title
    ddf = ddf.map_partitions(lambda d: d.assign(year=d['title'].str.extract(
        r'(\d{4,})', expand=False).astype(float)))

    # Remove year and geographical info from the title. They are already in other columns.
    ddf = ddf.map_partitions(lambda d: d.assign(title=d['title'].fillna(
        "_missing_").str.lower().apply(unidecode).str.replace(
            r'(\d+ )', '', regex=True).str.replace(
            r'\((.+)\)\s*$', '', regex=True).str.replace(
            r'\s{2,}', ' ', regex=True).fillna("_missing_")))

    ddf = ddf.map_partitions(lambda d: d.assign(description=d['description'].fillna(
        "_missing_").str.lower().apply(stem_description).fillna("_missing_")))
    return ddf
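# A quick standalone check (hypothetical title value) of the year-extraction
# regex used above.
import pandas as pd

titles = pd.Series(["Nicosia 2013 Vulka Bianco (Etna)"])
years = titles.str.extract(r'(\d{4,})', expand=False).astype(float)
print(years.iloc[0])  # 2013.0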