def transform(self, columns: ColumnNames, gdf: cudf.DataFrame) -> cudf.DataFrame:
    """Left-join stored groupby statistics onto ``gdf`` and return the joined columns.

    A temporary row-number column is written into ``gdf`` so that each merged
    result can be sorted back into the input row order; the column is dropped
    from ``gdf`` again before returning.

    Parameters
    ----------
    columns : ColumnNames
        Column selection handed to ``nvt_cat._get_multicolumn_names``.
    gdf : cudf.DataFrame
        Input frame; it is mutated in place while the temporary column exists.

    Returns
    -------
    cudf.DataFrame
        A new frame holding the columns contributed by the statistics frames
        (the join keys and the temporary column are removed).
    """
    row_order = "__tmp__"  # Row-number column used to restore the input order
    joined_stats = cudf.DataFrame()
    gdf[row_order] = cupy.arange(len(gdf), dtype="int32")

    group_names, grouped_columns = nvt_cat._get_multicolumn_names(
        columns, gdf.columns, self.name_sep
    )

    for group_name in group_names:
        stored_as = self.storage_name.get(group_name, group_name)
        key = grouped_columns.get(group_name, group_name)
        stats_path = self.categories[stored_as]

        # A tuple key is a multi-column group: join on the same column names
        # on both sides. Otherwise join the single column against the stored name.
        if isinstance(key, tuple):
            left_keys = list(key)
            right_keys = list(key)
        else:
            left_keys = [key]
            right_keys = [stored_as]

        stats = nvt_cat._read_groupby_stat_df(stats_path, stored_as, self.cat_cache)
        merged = (
            gdf[[*left_keys, row_order]]
            .merge(stats, left_on=left_keys, right_on=right_keys, how="left")
            .sort_values(row_order)
        )
        merged.drop(columns=[*left_keys, row_order], inplace=True)

        # Only copy over columns that an earlier group has not already produced.
        already_present = joined_stats.columns
        fresh_columns = [c for c in merged.columns if c not in already_present]
        joined_stats[fresh_columns] = merged[fresh_columns].reset_index(drop=True)

    gdf.drop(columns=[row_order], inplace=True)
    return joined_stats
def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None):
    """Apply ``self._op_group_logic`` to every group in ``self.cat_groups``.

    Parameters
    ----------
    gdf : cudf.DataFrame
        Input frame. A temporary ``"__tmp__"`` row-number column is added in
        place and dropped again before returning; the ``"__fold__"`` column is
        dropped as well when it is present and ``self.drop_folds`` is truthy.
    target_columns : list
        Not used by this method.
    stats_context : optional
        Mapping read for ``"means"`` when ``self.target_mean`` is falsy; also
        forwarded to ``self._op_group_logic``.

    Returns
    -------
    The per-group results concatenated column-wise (``axis=1``), or ``None``
    when ``self.cat_groups`` is empty.
    """
    # Row-number column so downstream logic can restore the input order
    row_order = "__tmp__"
    gdf[row_order] = cupy.arange(len(gdf), dtype="int32")

    # Folds are only fitted when the caller already provided a fold column
    has_folds = "__fold__" in gdf.columns

    # Mean of the continuous target column
    y_mean = self.target_mean or stats_context["means"]

    # Process each categorical-column group and stitch the results together
    encoded = None
    for position, group in enumerate(self.cat_groups):
        group_df = self._op_group_logic(
            group, gdf, stats_context, y_mean, has_folds, position
        )
        if encoded is None:
            encoded = group_df
        else:
            encoded = cudf.concat([encoded, group_df], axis=1)

    # Remove the temporary columns from the caller's frame
    if has_folds and self.drop_folds:
        temporary_columns = [row_order, "__fold__"]
    else:
        temporary_columns = [row_order]
    gdf.drop(columns=temporary_columns, inplace=True)
    return encoded
def transform(self, columns, gdf: cudf.DataFrame) -> cudf.DataFrame:
    """Merge ``gdf`` with ``self._ext`` and return the result in input row order.

    Parameters
    ----------
    columns
        Not used by this method.
    gdf : cudf.DataFrame
        Left side of the merge. A temporary ``"__tmp__"`` row-number column is
        added in place and dropped again before returning.

    Returns
    -------
    cudf.DataFrame
        ``gdf`` merged with ``self._ext`` (``left_on=self.on``,
        ``right_on=self.on_ext``, ``how=self.how``), sorted by the original
        row number and re-indexed from zero.
    """
    row_order = "__tmp__"  # Row-number column used to restore the input order
    gdf[row_order] = cupy.arange(len(gdf), dtype="int32")

    merged = gdf.merge(self._ext, left_on=self.on, right_on=self.on_ext, how=self.how)
    merged = merged.sort_values(row_order).drop(columns=[row_order])

    gdf.drop(columns=[row_order], inplace=True)
    return merged.reset_index(drop=True)
def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None):
    """Left-join the per-column groupby statistics onto ``gdf``.

    Parameters
    ----------
    gdf : cudf.DataFrame
        Input frame. A temporary ``"__tmp__"`` row-number column is added in
        place and dropped again before returning.
    target_columns : list
        Not used by this method.
    stats_context : optional
        Mapping whose ``self.stat_name`` entry maps a column name to the path
        passed to ``nvt_cat._read_groupby_stat_df``.

    Returns
    -------
    cudf.DataFrame
        A new frame holding the columns contributed by the statistics frames
        (the join key and the temporary column are removed).
    """
    row_order = "__tmp__"  # Row-number column used to restore the input order
    joined_stats = cudf.DataFrame()
    gdf[row_order] = cupy.arange(len(gdf), dtype="int32")

    stat_paths = stats_context[self.stat_name]
    for column, stats_path in stat_paths.items():
        stats = nvt_cat._read_groupby_stat_df(stats_path, column, self.cat_cache)
        merged = (
            gdf[[column, row_order]]
            .merge(stats, on=column, how="left")
            .sort_values(row_order)
        )
        merged.drop(columns=[column, row_order], inplace=True)

        # Only copy over columns that an earlier statistic has not already produced.
        already_present = joined_stats.columns
        fresh_columns = [c for c in merged.columns if c not in already_present]
        joined_stats[fresh_columns] = merged[fresh_columns].reset_index(drop=True)

    gdf.drop(columns=[row_order], inplace=True)
    return joined_stats
def apply_op(
    self,
    gdf: cudf.DataFrame,
    columns_ctx: dict,
    input_cols,
    target_cols=None,
    stats_context=None,
):
    """Merge ``gdf`` with ``self._ext`` and register the resulting columns.

    Parameters
    ----------
    gdf : cudf.DataFrame
        Left side of the merge. A temporary ``"__tmp__"`` row-number column is
        added in place and dropped again before returning.
    columns_ctx : dict
        Column context passed to ``self.get_columns`` and
        ``self.update_columns_ctx``.
    input_cols
        Input column group name, forwarded to the same two helpers.
    target_cols : list, optional
        Target column groups. Defaults to ``["base"]``; a fresh list is built
        on every call so the default is never shared between calls.
    stats_context : optional
        Not used by this method.

    Returns
    -------
    cudf.DataFrame
        ``gdf`` merged with ``self._ext`` (``left_on=self.on``,
        ``right_on=self.on_ext``, ``how=self.how``), sorted by the original
        row number and re-indexed from zero.
    """
    # Avoid a mutable default argument: build the default list per call.
    if target_cols is None:
        target_cols = ["base"]
    target_columns = self.get_columns(columns_ctx, input_cols, target_cols)

    tmp = "__tmp__"  # Temporary column for sorting
    gdf[tmp] = cupy.arange(len(gdf), dtype="int32")

    new_gdf = gdf.merge(self._ext, left_on=self.on, right_on=self.on_ext, how=self.how)
    # Sort by the row-number column so the result follows the input row order.
    new_gdf = new_gdf.sort_values(tmp)
    new_gdf.drop(columns=[tmp], inplace=True)
    gdf.drop(columns=[tmp], inplace=True)
    new_gdf.reset_index(drop=True, inplace=True)

    self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns)
    return new_gdf
def op_logic(self, gdf: cudf.DataFrame, target_columns: list, stats_context=None):
    """Left-join stored groupby statistics onto ``gdf`` and return the joined columns.

    Parameters
    ----------
    gdf : cudf.DataFrame
        Input frame. A temporary ``"__tmp__"`` row-number column is added in
        place and dropped again before returning.
    target_columns : list
        Column names to process when ``self.column_groups`` is falsy; names
        missing from ``gdf`` are skipped.
    stats_context : optional
        Mapping whose ``self.stat_name`` entry maps a storage name to the path
        passed to ``nvt_cat._read_groupby_stat_df``.

    Returns
    -------
    cudf.DataFrame
        A new frame holding the columns contributed by the statistics frames
        (the join keys and the temporary column are removed).
    """
    row_order = "__tmp__"  # Row-number column used to restore the input order
    joined_stats = cudf.DataFrame()
    gdf[row_order] = cupy.arange(len(gdf), dtype="int32")

    if self.column_groups:
        group_names, grouped_columns = nvt_cat._get_multicolumn_names(
            self.column_groups, gdf.columns, self.name_sep
        )
    else:
        grouped_columns = {}
        group_names = [col for col in target_columns if col in gdf.columns]

    for group_name in group_names:
        stored_as = self.storage_name.get(group_name, group_name)
        key = grouped_columns.get(group_name, group_name)
        stats_path = stats_context[self.stat_name][stored_as]

        # A list key is a multi-column group: join on the same column names on
        # both sides. Otherwise join the single column against the stored name.
        if isinstance(key, list):
            left_keys = key.copy()
            right_keys = key
        else:
            left_keys = [key]
            right_keys = [stored_as]

        stats = nvt_cat._read_groupby_stat_df(stats_path, stored_as, self.cat_cache)
        merged = (
            gdf[left_keys + [row_order]]
            .merge(stats, left_on=left_keys, right_on=right_keys, how="left")
            .sort_values(row_order)
        )
        merged.drop(columns=left_keys + [row_order], inplace=True)

        # Only copy over columns that an earlier group has not already produced.
        already_present = joined_stats.columns
        fresh_columns = [c for c in merged.columns if c not in already_present]
        joined_stats[fresh_columns] = merged[fresh_columns].reset_index(drop=True)

    gdf.drop(columns=[row_order], inplace=True)
    return joined_stats
def transform(self, columns: ColumnNames, gdf: cudf.DataFrame) -> cudf.DataFrame:
    """Apply ``self._op_group_logic`` to every categorical group in ``columns``.

    Parameters
    ----------
    columns : ColumnNames
        Iterable of column groups; a tuple is converted to a list and a plain
        string is wrapped in a one-element list before processing.
    gdf : cudf.DataFrame
        Input frame. A temporary ``"__tmp__"`` row-number column is added in
        place and dropped before returning. When ``self.kfold > 1`` a fold
        column named ``self.fold_name`` is added as well; it is dropped again
        only if ``self.drop_folds`` is truthy.

    Returns
    -------
    cudf.DataFrame
        The per-group results concatenated column-wise (``axis=1``). When folds
        are fitted and kept, the fold column is copied into the result too.
    """
    # Add temporary column for sorting
    tmp = "__tmp__"
    gdf[tmp] = cupy.arange(len(gdf), dtype="int32")

    fit_folds = self.kfold > 1
    if fit_folds:
        gdf[self.fold_name] = _add_fold(gdf.index, self.kfold, self.fold_seed)

    # Need mean of continuous target column
    y_mean = self.target_mean or self.means

    # Loop over categorical-column groups and apply logic
    new_gdf = None
    for ind, cat_group in enumerate(columns):
        if isinstance(cat_group, tuple):
            cat_group = list(cat_group)
        elif isinstance(cat_group, str):
            cat_group = [cat_group]

        _df = self._op_group_logic(cat_group, gdf, y_mean, fit_folds, ind)
        if new_gdf is None:
            new_gdf = _df
        else:
            new_gdf = cudf.concat([new_gdf, _df], axis=1)

    # Drop temporary columns. The fold column was created under
    # ``self.fold_name``, so it must be dropped under that same name rather
    # than a hard-coded "__fold__" (which breaks for a custom fold name).
    if fit_folds and self.drop_folds:
        temporary_columns = [tmp, self.fold_name]
    else:
        temporary_columns = [tmp]
    gdf.drop(columns=temporary_columns, inplace=True)

    if fit_folds and not self.drop_folds:
        new_gdf[self.fold_name] = gdf[self.fold_name]
    return new_gdf
def _extract_tld(input_df, suffix_df, col_len, col_dict, output_df):
    """
    Match each ``tld<i>`` column of ``input_df`` against the known suffixes and
    collect hostname / domain / suffix / subdomain for the rows that match.

    Parameters
    ----------
    input_df
        Frame with an ``idx`` column, string columns ``tld0`` .. ``tld<col_len>``
        and integer-labelled columns holding the individual hostname parts
        (see the example below).
    suffix_df
        Frame whose ``suffix`` column lists the known suffixes.
    col_len
        Highest tld column number; the loop runs for ``i`` in ``0..col_len``.
    col_dict
        Mapping with the keys ``hostname``, ``domain``, ``suffix`` and
        ``subdomain``; a truthy value requests that output column.
    output_df
        Previously collected results; each iteration's matches are concatenated
        in front of it.

    Returns
    -------
    ``output_df`` extended with the rows matched in this call.

    Example (column alignment reconstructed from the flattened original):-
        input:

               4    3    2     1       0  tld4    tld3        tld2             tld1                    tld0  idx
        0     ac  com  cnn  news  forums    ac  com.ac  cnn.com.ac  news.cnn.com.ac  forums.news.cnn.com.ac    0
        1          ac  cnn  news  forums            ac      cnn.ac      news.cnn.ac      forums.news.cnn.ac    1
        2              com   cnn       b                       com          cnn.com               b.cnn.com    2

        output:

                         hostname domain  suffix    subdomain  idx
        0  forums.news.cnn.com.ac    cnn  com.ac  forums.news    0
        2      forums.news.cnn.ac    cnn      ac  forums.news    1
        1               b.cnn.com    cnn     com            b    2
    """
    tmp_suffix_df = DataFrame()
    # Iterating over each tld column starting from tld0 until it finds a match.
    for i in range(col_len + 1):
        tld_col = "tld" + str(i)
        tmp_suffix_df[tld_col] = suffix_df["suffix"]
        # Left outer join input_df with tmp_suffix_df on tld column for each iteration.
        merged_df = input_df.merge(tmp_suffix_df, on=tld_col, how="left", suffixes=("", "_y"))
        col_pos = i - 1
        tld_r_col = "tld%s_y" % (str(col_pos))
        # Check for a right side column i.e, added to merged_df when join clause satisfies.
        # NOTE(review): a "tld<i-1>_y" column can only appear if tmp_suffix_df still
        # carries the previous iteration's tld column at merge time — confirm how
        # this interacts with the drop() calls at the end of the loop body.
        if tld_r_col in merged_df.columns:
            # Retrieve records which satisfies join clause.
            joined_recs_df = merged_df[merged_df[tld_r_col].isna() == False]
            if not joined_recs_df.empty:
                temp_df = DataFrame()
                temp_df["idx"] = joined_recs_df["idx"]
                if col_dict["hostname"]:
                    temp_df["hostname"] = joined_recs_df["tld0"]
                if col_dict["domain"]:
                    # The part immediately left of the matched suffix is the domain.
                    temp_df["domain"] = joined_recs_df[col_pos]
                if col_dict["suffix"]:
                    temp_df["suffix"] = joined_recs_df[tld_r_col]
                if col_dict["subdomain"]:
                    temp_df["subdomain"] = ""
                    if col_pos > 0:
                        # Join parts 0 .. col_pos-1 with "." to form the subdomain.
                        for idx in range(0, col_pos):
                            temp_df["subdomain"] = temp_df["subdomain"].str.cat(joined_recs_df[idx], sep=".")
                        # The leading "." left by concatenating onto "" is stripped.
                        # NOTE(review): the ".^" pattern looks like it can never
                        # match if treated as a regex — confirm intent.
                        temp_df["subdomain"] = (temp_df["subdomain"].str.replace(".^", "").str.lstrip("."))
                # Concat current iteration result to previous iteration result.
                output_df = cudf.concat([temp_df, output_df])
                # Assigning unprocessed records to input_df for next stage of processing.
                if i < col_len:
                    # Skip for last iteration. Since there won't be any entries to process further.
                    input_df = merged_df[merged_df[tld_r_col].isna()]
        # Release memory. Once tld_col column is no longer needed.
        # NOTE(review): the return values of both drop() calls are discarded; if
        # drop() is not in-place in the cudf version in use, these are no-ops — confirm.
        tmp_suffix_df.drop(tld_col)
        input_df.drop(tld_col)
    return output_df
def train_test_split(
    X: cudf.DataFrame,
    y: Union[str, cudf.Series],
    train_size: Union[float, int] = 0.8,
    shuffle: bool = True,
    seed: int = None,
) -> Tuple[cudf.DataFrame, cudf.DataFrame, cudf.DataFrame, cudf.DataFrame]:
    """
    Partitions the data into four collated dataframes, mimicking sklearn's
    `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame
        Data to split, has shape (n_samples, n_features)
    y : str or cudf.Series
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X containing the labels
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data to be assigned
        to the training set. If an int, represents the number of instances to
        be assigned to the training set. Defaults to 0.8
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    seed : int, optional
        If shuffle is true, seeds the generator. Unseeded by default

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s

    Raises
    ------
    ValueError
        If X and y differ in their first dimension, if a float `train_size`
        lies outside [0, 1], or if an int `train_size` lies outside
        [0, n_samples].
    """
    # TODO Use cupy indexing to support non cudf input types for X, y
    if isinstance(y, str):
        # Use the column with name `str` as y
        name = y
        y = X[name]
        X = X.drop(name)

    n_samples = X.shape[0]
    if n_samples != y.shape[0]:
        raise ValueError(
            "X and y must have the same first dimension "
            "(found {} and {})".format(n_samples, y.shape[0])
        )

    # Compute the split position exactly once. An integer train_size is an
    # absolute row count and must NOT be re-scaled by the number of samples;
    # every other value is treated as a proportion of the data.
    if isinstance(train_size, int):
        if not 0 <= train_size <= n_samples:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size)
            )
        split_idx = train_size
    else:
        if isinstance(train_size, float) and not 0 <= train_size <= 1:
            raise ValueError(
                "proportion train_size should be between "
                "0 and 1 (found {})".format(train_size)
            )
        split_idx = int(n_samples * train_size)

    if seed is not None:
        np.random.seed(seed)

    # Replace Numpy/cuDF here when issue mentioned above is solved!
    if shuffle:
        idxs = np.arange(len(X))
        _shuffle_idx(idxs)
        X = X.iloc[idxs].reset_index(drop=True)
        y = y.iloc[idxs].reset_index(drop=True)

    X_train = X.iloc[0:split_idx]
    y_train = y.iloc[0:split_idx]
    X_test = X.iloc[split_idx:]
    y_test = y.iloc[split_idx:]

    return X_train, X_test, y_train, y_test
def train_test_split(
    X: cudf.DataFrame,
    y: Union[str, cudf.Series],
    train_size: Union[float, int] = 0.8,
    shuffle: bool = True,
    seed: int = None,
) -> Tuple[cudf.DataFrame, cudf.DataFrame, cudf.DataFrame, cudf.DataFrame]:
    """
    Partitions the data into four collated dataframes, mimicking sklearn's
    `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame
        Data to split, has shape (n_samples, n_features)
    y : str or cudf.Series
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X containing the labels
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data to be assigned
        to the training set. If an int, represents the number of instances to
        be assigned to the training set. Defaults to 0.8
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    seed : int, optional
        If shuffle is true, seeds the generator. Unseeded by default

    Examples
    --------
    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s

    Raises
    ------
    ValueError
        If X and y differ in their first dimension, if a float `train_size`
        lies outside [0, 1], or if an int `train_size` lies outside
        [0, n_samples].
    """
    # TODO Use cupy indexing to support non cudf input types for X, y
    if isinstance(y, str):
        # Use the column with name `str` as y
        name = y
        y = X[name]
        X = X.drop(name)

    n_samples = X.shape[0]
    if n_samples != y.shape[0]:
        raise ValueError(
            "X and y must have the same first dimension "
            "(found {} and {})".format(n_samples, y.shape[0])
        )

    # Compute the split position exactly once. An integer train_size is an
    # absolute row count and must NOT be re-scaled by the number of samples;
    # every other value is treated as a proportion of the data.
    if isinstance(train_size, int):
        if not 0 <= train_size <= n_samples:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size)
            )
        split_idx = train_size
    else:
        if isinstance(train_size, float) and not 0 <= train_size <= 1:
            raise ValueError(
                "proportion train_size should be between "
                "0 and 1 (found {})".format(train_size)
            )
        split_idx = int(n_samples * train_size)

    if seed is not None:
        np.random.seed(seed)

    # Replace Numpy/cuDF here when issue mentioned above is solved!
    if shuffle:
        idxs = np.arange(len(X))
        _shuffle_idx(idxs)
        X = X.iloc[idxs].reset_index(drop=True)
        y = y.iloc[idxs].reset_index(drop=True)

    X_train = X.iloc[0:split_idx]
    y_train = y.iloc[0:split_idx]
    X_test = X.iloc[split_idx:]
    y_test = y.iloc[split_idx:]

    return X_train, X_test, y_train, y_test