コード例 #1
0
    def transform(self, columns: ColumnNames,
                  gdf: cudf.DataFrame) -> cudf.DataFrame:
        """Left-join stored group-by statistics onto ``gdf``, keeping row order.

        For every categorical name resolved from ``columns`` the frame read
        from ``self.categories`` is merged onto the matching key column(s) of
        ``gdf``.  The key columns are dropped from the merge result, so the
        returned frame contains only the columns contributed by the
        statistics frames.

        Parameters
        ----------
        columns : ColumnNames
            Column names (or multi-column groups) to process.
        gdf : cudf.DataFrame
            Input frame.  A helper column ``"__tmp__"`` is added to it while
            the method runs and removed again before returning.

        Returns
        -------
        cudf.DataFrame
            A new frame with the merged columns, in the row order of ``gdf``.
        """
        # Row-position marker: sorting on it after each merge restores the
        # original row order of ``gdf``.
        order_col = "__tmp__"
        gdf[order_col] = cupy.arange(len(gdf), dtype="int32")

        names, grouped = nvt_cat._get_multicolumn_names(
            columns, gdf.columns, self.name_sep)

        result = cudf.DataFrame()
        for cat_name in names:
            stored_as = self.storage_name.get(cat_name, cat_name)
            key = grouped.get(cat_name, cat_name)
            stat_path = self.categories[stored_as]

            # A tuple key is a multi-column group: both sides join on the
            # member columns.  A single column joins against the storage name.
            if isinstance(key, tuple):
                left_keys, right_keys = list(key), list(key)
            else:
                left_keys, right_keys = [key], [stored_as]

            stats = nvt_cat._read_groupby_stat_df(stat_path, stored_as,
                                                  self.cat_cache)
            merged = gdf[left_keys + [order_col]].merge(
                stats, left_on=left_keys, right_on=right_keys, how="left")
            merged = merged.sort_values(order_col)
            merged.drop(columns=left_keys + [order_col], inplace=True)

            # Only copy columns that an earlier group has not already added.
            fresh = [c for c in merged.columns if c not in result.columns]
            result[fresh] = merged[fresh].reset_index(drop=True)

        gdf.drop(columns=[order_col], inplace=True)
        return result
コード例 #2
0
    def op_logic(self,
                 gdf: cudf.DataFrame,
                 target_columns: list,
                 stats_context=None):
        """Apply ``_op_group_logic`` to every group in ``self.cat_groups``.

        The per-group results are concatenated column-wise (``axis=1``) in
        group order.

        Parameters
        ----------
        gdf : cudf.DataFrame
            Input frame.  A helper column ``"__tmp__"`` is added while the
            method runs and dropped before returning; a ``"__fold__"`` column
            is dropped as well when it is present and ``self.drop_folds`` is
            truthy.
        target_columns : list
            Not used by this method.
        stats_context : mapping, optional
            Passed through to ``_op_group_logic``; its ``"means"`` entry is
            used when ``self.target_mean`` is falsy.

        Returns
        -------
        cudf.DataFrame or None
            The concatenated per-group results, or ``None`` when
            ``self.cat_groups`` is empty.
        """
        # Row-position marker, removed again at the end of the method.
        order_col = "__tmp__"
        gdf[order_col] = cupy.arange(len(gdf), dtype="int32")

        # The "fit" path is only taken when a fold column is present.
        has_folds = "__fold__" in gdf.columns

        # Mean of the continuous target column.
        y_mean = self.target_mean or stats_context["means"]

        combined = None
        for ind, cat_group in enumerate(self.cat_groups):
            encoded = self._op_group_logic(cat_group, gdf, stats_context,
                                           y_mean, has_folds, ind)
            if combined is None:
                combined = encoded
            else:
                combined = cudf.concat([combined, encoded], axis=1)

        # Remove the helper column(s) from the caller's frame.
        to_drop = [order_col]
        if has_folds and self.drop_folds:
            to_drop.append("__fold__")
        gdf.drop(columns=to_drop, inplace=True)
        return combined
コード例 #3
0
 def transform(self, columns, gdf: cudf.DataFrame) -> cudf.DataFrame:
     """Merge ``gdf`` with ``self._ext`` and return the rows in input order.

     The join uses ``self.on`` / ``self.on_ext`` as the left / right keys and
     ``self.how`` as the join type.  A helper column ``"__tmp__"`` is added
     to ``gdf`` for the duration of the call and removed from both the input
     and the result.  ``columns`` is not used by this method.
     """
     # Row-position marker: sorting on it after the merge restores the
     # original row order.
     order_col = "__tmp__"
     gdf[order_col] = cupy.arange(len(gdf), dtype="int32")

     joined = gdf.merge(
         self._ext,
         left_on=self.on,
         right_on=self.on_ext,
         how=self.how,
     ).sort_values(order_col)
     joined = joined.drop(columns=[order_col])
     gdf.drop(columns=[order_col], inplace=True)
     return joined.reset_index(drop=True)
コード例 #4
0
ファイル: ops.py プロジェクト: maruyue/NVTabular
 def op_logic(self,
              gdf: cudf.DataFrame,
              target_columns: list,
              stats_context=None):
     """Left-join the statistics of every column in the stats context.

     ``stats_context[self.stat_name]`` maps a column name to a path; the
     frame read from that path is merged onto the column of the same name in
     ``gdf``.  The key column is dropped from each merge result and the
     remaining columns are collected into a new frame.

     ``gdf`` temporarily receives a helper column ``"__tmp__"``, which is
     removed before returning.  ``target_columns`` is not used here.
     """
     # Row-position marker: sorting on it after each merge restores the
     # original row order.
     order_col = "__tmp__"
     gdf[order_col] = cupy.arange(len(gdf), dtype="int32")

     result = cudf.DataFrame()
     for col, stat_path in stats_context[self.stat_name].items():
         stats = nvt_cat._read_groupby_stat_df(stat_path, col, self.cat_cache)
         merged = gdf[[col, order_col]].merge(stats, on=col, how="left")
         merged = merged.sort_values(order_col)
         merged = merged.drop(columns=[col, order_col])

         # Only copy columns that an earlier iteration has not already added.
         fresh = [c for c in merged.columns if c not in result.columns]
         result[fresh] = merged[fresh].reset_index(drop=True)

     gdf.drop(columns=[order_col], inplace=True)
     return result
コード例 #5
0
ファイル: join_external.py プロジェクト: vslyu/NVTabular
 def apply_op(
     self,
     gdf: cudf.DataFrame,
     columns_ctx: dict,
     input_cols,
     target_cols=["base"],
     stats_context=None,
 ):
     """Merge ``gdf`` with ``self._ext`` and register the resulting columns.

     The target columns are resolved through ``self.get_columns`` before the
     merge.  The join uses ``self.on`` / ``self.on_ext`` as the left / right
     keys and ``self.how`` as the join type; the result is put back into the
     row order of ``gdf`` with a fresh index.  Afterwards
     ``self.update_columns_ctx`` is called with the columns of the result.

     ``gdf`` temporarily receives a helper column ``"__tmp__"``, which is
     removed before returning.  ``stats_context`` is not used here.
     """
     target_columns = self.get_columns(columns_ctx, input_cols, target_cols)

     # Row-position marker: sorting on it after the merge restores the
     # original row order.
     order_col = "__tmp__"
     gdf[order_col] = cupy.arange(len(gdf), dtype="int32")

     joined = gdf.merge(
         self._ext,
         left_on=self.on,
         right_on=self.on_ext,
         how=self.how,
     ).sort_values(order_col)
     joined = joined.drop(columns=[order_col])
     gdf.drop(columns=[order_col], inplace=True)
     joined = joined.reset_index(drop=True)

     self.update_columns_ctx(columns_ctx, input_cols, joined.columns, target_columns)
     return joined
コード例 #6
0
    def op_logic(self,
                 gdf: cudf.DataFrame,
                 target_columns: list,
                 stats_context=None):
        """Left-join stored group-by statistics onto ``gdf``, keeping row order.

        When ``self.column_groups`` is set, the names to process (and any
        multi-column groups) come from ``nvt_cat._get_multicolumn_names``;
        otherwise every entry of ``target_columns`` that exists in ``gdf`` is
        processed as a single column.  For each name the frame read from
        ``stats_context[self.stat_name]`` is merged onto the key column(s),
        the key columns are dropped, and the remaining columns are collected
        into a new frame.

        ``gdf`` temporarily receives a helper column ``"__tmp__"``, which is
        removed before returning.
        """
        # Row-position marker: sorting on it after each merge restores the
        # original row order.
        order_col = "__tmp__"
        gdf[order_col] = cupy.arange(len(gdf), dtype="int32")

        if not self.column_groups:
            grouped = {}
            names = [col for col in target_columns if col in gdf.columns]
        else:
            names, grouped = nvt_cat._get_multicolumn_names(
                self.column_groups, gdf.columns, self.name_sep)

        result = cudf.DataFrame()
        for cat_name in names:
            stored_as = self.storage_name.get(cat_name, cat_name)
            key = grouped.get(cat_name, cat_name)
            stat_path = stats_context[self.stat_name][stored_as]

            # A list key is a multi-column group: both sides join on the
            # member columns.  A single column joins against the storage name.
            if isinstance(key, list):
                left_keys, right_keys = key.copy(), key
            else:
                left_keys, right_keys = [key], [stored_as]

            stats = nvt_cat._read_groupby_stat_df(stat_path, stored_as,
                                                  self.cat_cache)
            merged = gdf[left_keys + [order_col]].merge(
                stats, left_on=left_keys, right_on=right_keys, how="left")
            merged = merged.sort_values(order_col)
            merged.drop(columns=left_keys + [order_col], inplace=True)

            # Only copy columns that an earlier group has not already added.
            fresh = [c for c in merged.columns if c not in result.columns]
            result[fresh] = merged[fresh].reset_index(drop=True)

        gdf.drop(columns=[order_col], inplace=True)
        return result
コード例 #7
0
ファイル: target_encoding.py プロジェクト: rnyak/NVTabular
    def transform(self, columns: ColumnNames,
                  gdf: cudf.DataFrame) -> cudf.DataFrame:
        """Apply ``_op_group_logic`` to every categorical group in ``columns``.

        The per-group results are concatenated column-wise (``axis=1``) in
        group order.

        Parameters
        ----------
        columns : ColumnNames
            Iterable of groups; a ``str`` is treated as a one-column group and
            a ``tuple`` is converted to a list.
        gdf : cudf.DataFrame
            Input frame.  A helper column ``"__tmp__"`` is added while the
            method runs and dropped before returning.  When ``self.kfold > 1``
            a fold column named ``self.fold_name`` is added as well; it is
            dropped again only if ``self.drop_folds`` is truthy.

        Returns
        -------
        cudf.DataFrame or None
            The concatenated per-group results (plus the fold column when
            folds are used and kept), or ``None`` when ``columns`` is empty
            and no fold column has to be copied.
        """
        # Add temporary column for sorting
        tmp = "__tmp__"
        gdf[tmp] = cupy.arange(len(gdf), dtype="int32")

        fit_folds = self.kfold > 1
        if fit_folds:
            gdf[self.fold_name] = _add_fold(gdf.index, self.kfold,
                                            self.fold_seed)

        # Need mean of continuous target column
        y_mean = self.target_mean or self.means

        # Loop over categorical-column groups and apply logic
        new_gdf = None
        for ind, cat_group in enumerate(columns):
            if isinstance(cat_group, tuple):
                cat_group = list(cat_group)
            elif isinstance(cat_group, str):
                cat_group = [cat_group]

            encoded = self._op_group_logic(cat_group, gdf, y_mean, fit_folds,
                                           ind)
            if new_gdf is None:
                new_gdf = encoded
            else:
                new_gdf = cudf.concat([new_gdf, encoded], axis=1)

        # Drop temporary columns.  The fold column is addressed through
        # ``self.fold_name`` -- the same name it was created under above --
        # rather than a hard-coded literal, so the drop cannot miss it.
        drop_cols = [tmp]
        if fit_folds and self.drop_folds:
            drop_cols.append(self.fold_name)
        gdf.drop(columns=drop_cols, inplace=True)

        if fit_folds and not self.drop_folds:
            new_gdf[self.fold_name] = gdf[self.fold_name]
        return new_gdf
コード例 #8
0
def _extract_tld(input_df, suffix_df, col_len, col_dict, output_df):
    """
    Match candidate suffix columns of ``input_df`` against a suffix list and
    build the requested hostname / domain / suffix / subdomain columns.

    For ``i`` in ``0..col_len`` the column ``tld<i>`` of ``input_df`` is
    left-joined against ``suffix_df["suffix"]``.  Rows whose join produced a
    value are emitted into the output; the remaining rows are carried over to
    the next iteration.

    Parameters
    ----------
    input_df : DataFrame
        Must contain an ``idx`` column, the ``tld0..tld<col_len>`` columns and
        integer-labelled columns holding the individual hostname parts (see
        the example below).
    suffix_df : DataFrame
        Must contain a ``suffix`` column.
    col_len : int
        Highest ``tld`` column index to try.
    col_dict : dict
        Flags keyed by ``"hostname"``, ``"domain"``, ``"suffix"`` and
        ``"subdomain"``; a truthy value requests that output column.
    output_df : DataFrame
        Results accumulated so far; new results are concatenated in front of
        it.

    Returns
    -------
    DataFrame
        ``output_df`` extended with the rows matched by this call.

    Example:-
        input:
               4    3                2          1           0  tld4    tld3             tld2                 tld1                        tld0    idx
            0 ac  com              cnn       news      forums    ac  com.ac       cnn.com.ac      news.cnn.com.ac      forums.news.cnn.com.ac      0
            1     ac               cnn       news      forums            ac           cnn.ac          news.cnn.ac          forums.news.cnn.ac      1
            2                      com        cnn           b                            com              cnn.com                   b.cnn.com      2

        output:
                              hostname      domain        suffix       subdomain   idx
            0   forums.news.cnn.com.ac         cnn        com.ac     forums.news     0
            2       forums.news.cnn.ac         cnn            ac     forums.news     1
            1                b.cnn.com         cnn           com               b     2
    """

    # Grows by one column per iteration; every column is a copy of the suffix
    # list, named after the tld column it is joined against.
    tmp_suffix_df = DataFrame()
    # Iterate over the tld columns, starting from tld0.
    for i in range(col_len + 1):
        tld_col = "tld" + str(i)
        tmp_suffix_df[tld_col] = suffix_df["suffix"]
        # Left outer join input_df with tmp_suffix_df on the current tld
        # column.  Right-hand columns whose names clash with input_df columns
        # receive the "_y" suffix.
        merged_df = input_df.merge(tmp_suffix_df,
                                   on=tld_col,
                                   how="left",
                                   suffixes=("", "_y"))
        col_pos = i - 1
        tld_r_col = "tld%s_y" % (str(col_pos))
        # The right-hand "tld<i-1>_y" column is used as the match indicator.
        # For i == 0 this name is "tld-1_y", which the merge does not create,
        # so the first iteration emits nothing.
        if tld_r_col in merged_df.columns:
            # Rows with a non-null indicator are the ones the join matched.
            joined_recs_df = merged_df[merged_df[tld_r_col].isna() == False]
            if not joined_recs_df.empty:
                temp_df = DataFrame()
                temp_df["idx"] = joined_recs_df["idx"]
                if col_dict["hostname"]:
                    # tld0 holds the full hostname (see the docstring example).
                    temp_df["hostname"] = joined_recs_df["tld0"]
                if col_dict["domain"]:
                    # Integer-labelled column col_pos is taken as the domain.
                    temp_df["domain"] = joined_recs_df[col_pos]
                if col_dict["suffix"]:
                    # The indicator column carries the matched suffix value.
                    temp_df["suffix"] = joined_recs_df[tld_r_col]
                if col_dict["subdomain"]:
                    temp_df["subdomain"] = ""
                    if col_pos > 0:
                        # Append parts 0..col_pos-1, separated by ".".
                        for idx in range(0, col_pos):
                            temp_df["subdomain"] = temp_df[
                                "subdomain"].str.cat(joined_recs_df[idx],
                                                     sep=".")
                        # Strip the leading "." left by concatenating onto the
                        # empty start value.
                        # NOTE(review): the intent of the ".^" pattern is not
                        # evident from this code -- confirm against the string
                        # backend in use.
                        temp_df["subdomain"] = (
                            temp_df["subdomain"].str.replace(
                                ".^", "").str.lstrip("."))
                # Prepend this iteration's result to the accumulated output.
                output_df = cudf.concat([temp_df, output_df])
                # Carry the unmatched rows over to the next iteration.
                if i < col_len:
                    # Skipped on the last iteration: nothing is left to
                    # process afterwards.
                    input_df = merged_df[merged_df[tld_r_col].isna()]
    # Intended to release memory once the tld column is no longer needed.
    # NOTE(review): the return values of both drop() calls are discarded, so
    # these lines only have an effect if drop() mutates in place -- confirm.
    # tld_col is also unbound here when the loop body never ran.
    tmp_suffix_df.drop(tld_col)
    input_df.drop(tld_col)
    return output_df
コード例 #9
0
def train_test_split(
    X: cudf.DataFrame,
    y: Union[str, cudf.Series],
    train_size: Union[float, int] = 0.8,
    shuffle: bool = True,
    seed: int = None,
) -> Tuple[cudf.DataFrame, cudf.DataFrame, cudf.DataFrame, cudf.DataFrame]:
    """
    Partitions the data into four collated dataframes, mimicking sklearn's
    `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame
        Data to split, has shape (n_samples, n_features)
    y : str or cudf.Series
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X containing the labels
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. Defaults to 0.8
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    seed : int, optional
        If given, seeds NumPy's global random generator before the optional
        shuffle. Unseeded by default

    Returns
    -------
    X_train, X_test, y_train, y_test
        Partitioned data: the first `split` rows go to the training set and
        the remainder to the test set. If `y` was provided as a column name,
        the column was dropped from the `X`s

    Raises
    ------
    ValueError
        If `X` and `y` differ in their first dimension, or if `train_size`
        is outside the valid range for its type.
    """
    # TODO Use cupy indexing to support non cudf input types for X, y
    if isinstance(y, str):
        # Use the column with name `str` as y
        name = y
        y = X[name]
        # Name the axis explicitly: a bare `drop(name)` targets row labels
        # under the pandas-compatible `drop` signature.
        X = X.drop(columns=[name])

    n_rows = X.shape[0]
    if n_rows != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(n_rows, y.shape[0]))

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))
        split_idx = int(n_rows * train_size)
    elif isinstance(train_size, int):
        if not 0 <= train_size <= n_rows:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))
        # An integer is an absolute row count and must not be rescaled.
        split_idx = train_size
    else:
        # Other numeric types keep the previous proportional interpretation.
        split_idx = int(n_rows * train_size)

    if seed is not None:
        np.random.seed(seed)

    # Replace Numpy/cuDF here when issue mentioned above is solved!
    if shuffle:
        idxs = np.arange(len(X))
        # The return value is not used, so the helper is expected to permute
        # `idxs` in place.
        _shuffle_idx(idxs)
        X = X.iloc[idxs].reset_index(drop=True)
        y = y.iloc[idxs].reset_index(drop=True)

    X_train = X.iloc[0:split_idx]
    y_train = y.iloc[0:split_idx]
    X_test = X.iloc[split_idx:]
    y_test = y.iloc[split_idx:]

    return X_train, X_test, y_train, y_test
コード例 #10
0
ファイル: model_selection.py プロジェクト: trxcllnt/cuml
def train_test_split(
    X: cudf.DataFrame,
    y: Union[str, cudf.Series],
    train_size: Union[float, int] = 0.8,
    shuffle: bool = True,
    seed: int = None,
) -> Tuple[cudf.DataFrame, cudf.DataFrame, cudf.DataFrame, cudf.DataFrame]:
    """
    Partitions the data into four collated dataframes, mimicking sklearn's
    `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame
        Data to split, has shape (n_samples, n_features)
    y : str or cudf.Series
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X containing the labels
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. Defaults to 0.8
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    seed : int, optional
        If given, seeds NumPy's global random generator before the optional
        shuffle. Unseeded by default

    Examples
    --------

    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(columns=['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test
        Partitioned data: the first `split` rows go to the training set and
        the remainder to the test set. If `y` was provided as a column name,
        the column was dropped from the `X`s

    Raises
    ------
    ValueError
        If `X` and `y` differ in their first dimension, or if `train_size`
        is outside the valid range for its type.
    """
    # TODO Use cupy indexing to support non cudf input types for X, y
    if isinstance(y, str):
        # Use the column with name `str` as y
        name = y
        y = X[name]
        # Name the axis explicitly: a bare `drop(name)` targets row labels
        # under the pandas-compatible `drop` signature.
        X = X.drop(columns=[name])

    n_rows = X.shape[0]
    if n_rows != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(n_rows, y.shape[0]))

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))
        split_idx = int(n_rows * train_size)
    elif isinstance(train_size, int):
        if not 0 <= train_size <= n_rows:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))
        # An integer is an absolute row count and must not be rescaled.
        split_idx = train_size
    else:
        # Other numeric types keep the previous proportional interpretation.
        split_idx = int(n_rows * train_size)

    if seed is not None:
        np.random.seed(seed)

    # Replace Numpy/cuDF here when issue mentioned above is solved!
    if shuffle:
        idxs = np.arange(len(X))
        # The return value is not used, so the helper is expected to permute
        # `idxs` in place.
        _shuffle_idx(idxs)
        X = X.iloc[idxs].reset_index(drop=True)
        y = y.iloc[idxs].reset_index(drop=True)

    X_train = X.iloc[0:split_idx]
    y_train = y.iloc[0:split_idx]
    X_test = X.iloc[split_idx:]
    y_test = y.iloc[split_idx:]

    return X_train, X_test, y_train, y_test