Example #1
    def merge_cats_encoding(self, ser, cats):
        # ser and cats are both Series
        # convert both to DataFrames so the category names can be merged onto ser by value
        offs = None
        if _is_list_dtype(ser.dtype) or _is_list_dtype(ser):
            ser, offs = _pull_apart_list(ser)
        ser = _make_df({"vals": ser})
        cats = _make_df({"names": cats})
        cats["vals"] = cats.index
        ser = ser.merge(cats, on=["vals"], how="left")

        return ser["names"], offs
Example #2
 def _separate_list_columns(self, gdf):
     lists, scalars = [], []
     for col in gdf.columns:
         if _is_list_dtype(gdf[col]):
             lists.append(col)
         else:
             scalars.append(col)
     return scalars, lists
Example #3
 def _separate_list_columns(self, gdf):
     lists, scalars = [], []
     for col in gdf.columns:
         if _is_list_dtype(gdf[col]):
             lists.append(col)
         else:
             scalars.append(col)
     return _get_embedding_order(scalars), _get_embedding_order(lists)
Example #4
    def fit(self, columns: ColumnNames, ddf: dd.DataFrame):
        # User passed in a list of column groups. We need to figure out
        # if this list contains any multi-column groups, and if there
        # are any (obvious) problems with these groups
        columns_uniq = list(set(flatten(columns, container=tuple)))
        columns_all = list(flatten(columns, container=tuple))
        if sorted(columns_all) != sorted(
                columns_uniq) and self.encode_type == "joint":
            # If we are doing "joint" encoding, there must be unique mapping
            # between input column names and column groups.  Otherwise, more
            # than one unique-value table could be used to encode the same
            # column.
            raise ValueError("Same column name included in multiple groups.")

        for group in columns:
            if isinstance(group, tuple) and len(group) > 1:
                # For multi-column groups, we concatenate column names
                # to get the "group" name.
                name = _make_name(*group, sep=self.name_sep)
                for col in group:
                    self.storage_name[col] = name

        # Check metadata type to reset on_host and cat_cache if the
        # underlying ddf is already a pandas-backed collection
        if isinstance(ddf._meta, pd.DataFrame):
            self.on_host = False
            # Cannot use "device" caching if the data is pandas-backed
            self.cat_cache = "host" if self.cat_cache == "device" else self.cat_cache
            if self.search_sorted:
                # Pandas' search_sorted only works with Series.
                # For now, it is safest to disallow this option.
                self.search_sorted = False
                warnings.warn(
                    "Cannot use `search_sorted=True` for pandas-backed data.")

        # convert tuples to lists
        columns = [list(c) if isinstance(c, tuple) else c for c in columns]
        dsk, key = _category_stats(
            ddf,
            columns,
            [],
            [],
            self.out_path,
            self.freq_threshold,
            self.tree_width,
            self.on_host,
            concat_groups=self.encode_type == "joint",
            name_sep=self.name_sep,
            max_size=self.max_size,
            num_buckets=self.num_buckets,
        )
        # TODO: we can't check the dtypes on the ddf here since they are incorrect
        # for cudf's list type. So, we're checking the partitions. fix.
        return Delayed(key,
                       dsk), ddf.map_partitions(lambda df: _is_list_dtype(df))
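
The duplicate check near the top compares the flattened column list against its de-duplicated version; if any column name appears in more than one group, the two differ and "joint" encoding is rejected. A small made-up illustration:

# "item_id" appears both inside a multi-column group and on its own, so the
# flattened list carries a duplicate and joint encoding would raise ValueError.
columns = [("user_id", "item_id"), "item_id"]
columns_all = ["user_id", "item_id", "item_id"]   # flatten(columns, container=tuple)
columns_uniq = ["item_id", "user_id"]             # list(set(...)) of the above
assert sorted(columns_all) != sorted(columns_uniq)
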
Example #5
def get_embedding_sizes(source, output_dtypes=None):
    """Returns a dictionary of embedding sizes from a workflow or column_group

    Parameters
    ----------
    source : Workflow or ColumnGroup
        Either a nvtabular Workflow or ColumnGroup object that we should use to find
        embedding sizes
    output_dtypes : dict, optional
        Optional dictionary of column_name:dtype. If passing a workflow object, dtypes
        will be read from the workflow. This is used to figure out which columns
        are multihot-categorical, which are split out by this function. If passed a column_group
        and this parameter isn't set, you won't have multihot columns returned separately
    """
    # TODO: do we need to distinguish multihot columns here? (if so, why?)

    # have to lazy import Workflow to avoid circular import errors
    from nvtabular.workflow import Workflow

    if isinstance(source, Workflow):
        queue = [source.column_group]
        output_dtypes = output_dtypes or source.output_dtypes
    else:
        # passed in a column group
        queue = [source]
        output_dtypes = output_dtypes or {}

    output = {}
    multihot_columns = set()
    while queue:
        current = queue.pop()
        if current.op and hasattr(current.op, "get_embedding_sizes"):
            output.update(current.op.get_embedding_sizes(current.columns))
        elif not current.op:
            # only follow parents if it's not an operator node (which could
            # transform the meaning of get_embedding_sizes)
            queue.extend(current.parents)

    for column in output:
        dtype = output_dtypes.get(column)
        if dtype and _is_list_dtype(dtype):
            # multi hot so remove from output and add to multihot
            multihot_columns.add(column)
    # TODO: returning different return types like this (based on the presence
    # of multihot features) is pretty janky. fix.
    if not multihot_columns:
        return output

    single_hots = {
        k: v
        for k, v in output.items() if k not in multihot_columns
    }
    multi_hots = {k: v for k, v in output.items() if k in multihot_columns}
    return single_hots, multi_hots
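
Because the return shape changes when multihot columns are present, callers generally need to handle both cases. A hedged usage sketch (the workflow object and the column contents are assumed, not taken from this source):

result = get_embedding_sizes(workflow)      # workflow assumed to be a fitted Workflow
if isinstance(result, tuple):
    single_hots, multi_hots = result        # some columns are multihot (list dtype)
else:
    single_hots, multi_hots = result, {}    # every column is single-hot
for name, (cardinality, dim) in {**single_hots, **multi_hots}.items():
    print(name, cardinality, dim)           # each value is a (cardinality, dim) pair
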
Example #6
 def transform(self, col_selector: ColumnSelector,
               df: DataFrameType) -> DataFrameType:
     for name in col_selector.names:
         column = df[name]
         if _is_list_dtype(column):
             transformed = np.log(
                 _flatten_list_column_values(column).astype(np.float32) + 1)
             df[name] = _encode_list_column(column, transformed)
         else:
             df[name] = np.log(column.astype(np.float32) + 1)
     return df
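
For list columns the transform runs on the flattened leaf values and the result is re-attached to the original row offsets, so the list boundaries survive. A conceptual numpy sketch of that arithmetic (the helpers above are NVTabular internals and are not reimplemented here):

import numpy as np

values = np.array([1.0, 3.0, 7.0, 0.0], dtype=np.float32)  # flattened leaf values
offsets = np.array([0, 2, 4])                              # two rows of two elements each
transformed = np.log(values + 1)                           # same length as values
# re-pairing transformed with offsets is what _encode_list_column takes care of
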
Example #7
def _cudf_to_array(df, cpu=True):
    output = {}
    for name in df.columns:
        col = df[name]
        if _is_list_dtype(col.dtype):
            offsets = col._column.offsets.values_host if cpu else col._column.offsets.values
            values = col.list.leaves.values_host if cpu else col.list.leaves.values
            output[name] = (values, offsets)
        else:
            output[name] = col.values_host if cpu else col.values

    return output
Example #8
def _chunkwise_moments(df):
    vals = {name: type(df)() for name in ["count", "sum", "squaredsum"]}
    for name in df.columns:
        column = df[name]
        if _is_list_dtype(column):
            column = _flatten_list_column_values(column)

        vals["count"][name] = [column.count()]
        vals["sum"][name] = [column.sum().astype("float64")]
        vals["squaredsum"][name] = [column.astype("float64").pow(2).sum()]

    # NOTE: Perhaps we should convert to pandas here
    # (since we know the results should be small)?
    return vals
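
The per-chunk count, sum, and squared sum are sufficient statistics: once they are reduced across chunks, the global mean and variance fall out directly. A sketch of that combine step (assuming count_tot, sum_tot, and sqsum_tot are column-wise totals over all chunks):

import numpy as np

def combine_moments(count_tot, sum_tot, sqsum_tot):
    # mean = E[x]; population variance = E[x^2] - E[x]^2
    mean = sum_tot / count_tot
    var = sqsum_tot / count_tot - mean ** 2
    return mean, np.sqrt(np.maximum(var, 0))  # clamp rounding noise below zero
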
Example #9
def _add_model_param(column, dtype, paramclass, params, dims=None):
    dims = dims if dims is not None else [-1, 1]
    if _is_list_dtype(dtype):
        params.append(
            paramclass(name=column + "__values",
                       data_type=_convert_dtype(dtype.element_type),
                       dims=dims))
        params.append(
            paramclass(name=column + "__nnzs",
                       data_type=model_config.TYPE_INT64,
                       dims=dims))
    else:
        params.append(
            paramclass(name=column, data_type=_convert_dtype(dtype),
                       dims=dims))
Example #10
 def get_row_size(self, row, cats_rep):
     """
     row = cudf.DataFrame comprising of one row
     """
     size = 0
     for col in row.columns:
         if _is_list_dtype(row[col].dtype):
             # second from last position is max list length
             # find correct cats_rep by scanning through all for column name
             tar = self.find_target_rep(col, cats_rep)
             # else use default 1
             val = tar.multi_max if tar else 1
             size = size + row[col]._column.elements.dtype.itemsize * val
         else:
             size = size + row[col].dtype.itemsize
     return size
Example #11
 def fit(self, col_selector: ColumnSelector, ddf: dd.DataFrame) -> Any:
     stats = {}
     for col in col_selector.names:
         series = ddf[col]
         if _is_list_dtype(series.compute()):
             stats[col] = stats[col] if col in stats else {}
             stats[col]["value_count"] = ({}
                                          if "value_count" not in stats[col]
                                          else stats[col]["value_count"])
             offs = _pull_apart_list(series.compute())[1]
             lh, rh = offs[1:], offs[:-1]
             rh = rh.reset_index(drop=True)
             lh = lh.reset_index(drop=True)
             deltas = lh - rh
             # must be regular python class otherwise protobuf fails
             stats[col]["value_count"]["min"] = int(deltas.min())
             stats[col]["value_count"]["max"] = int(deltas.max())
     return stats
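
The offsets returned by _pull_apart_list mark where each row's list begins, so differencing adjacent offsets yields the per-row list lengths that feed the value_count stats. A small worked example (offsets made up):

# Offsets for three rows holding 2, 0 and 3 elements respectively.
offs = [0, 2, 2, 5]
deltas = [offs[i + 1] - offs[i] for i in range(len(offs) - 1)]  # [2, 0, 3]
value_count = {"min": min(deltas), "max": max(deltas)}          # {'min': 0, 'max': 3}
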
Example #12
    def initialize(self, args):
        # Arg parsing
        workflow_path = os.path.join(
            args["model_repository"], str(args["model_version"]), "workflow"
        )
        model_device = args["model_instance_kind"]

        # Workflow instantiation
        self.workflow = nvtabular.Workflow.load(workflow_path)
        column_types = get_column_types(workflow_path)  # cats and conts (which duplicates tags)

        # Config loading and parsing
        self.model_config = json.loads(args["model_config"])
        model_framework = self.model_config["parameters"]["output_model"]["string_value"]

        # Dtype parsing
        input_dtypes = self.workflow.input_dtypes.items()
        self.input_dtypes, self.input_multihots = _parse_input_dtypes(input_dtypes)

        self.output_dtypes = dict()
        for name, dtype in self.workflow.output_dtypes.items():
            if not _is_list_dtype(dtype):
                self._set_output_dtype(name)
            else:
                self._set_output_dtype(name + "__nnzs")
                self._set_output_dtype(name + "__values")

        if model_framework == "hugectr":
            runner_class = HugeCTRWorkflowRunner
        elif model_framework == "pytorch":
            runner_class = PyTorchWorkflowRunner
        else:
            runner_class = TensorflowWorkflowRunner

        self.runner = runner_class(
            self.workflow, column_types, self.output_dtypes, self.model_config, model_device
        )
Example #13
def convert_df_to_triton_input(column_names,
                               batch,
                               input_class=grpcclient.InferInput):
    columns = [(col, batch[col]) for col in column_names]
    inputs = []
    for i, (name, col) in enumerate(columns):
        if _is_list_dtype(col):
            if isinstance(col, pd.Series):
                raise ValueError(
                    "this function doesn't support CPU list values yet")
            inputs.append(
                _convert_column_to_triton_input(
                    col._column.offsets.values_host.astype("int64"),
                    name + "__nnzs", input_class))
            inputs.append(
                _convert_column_to_triton_input(
                    col.list.leaves.values_host.astype("int64"),
                    name + "__values", input_class))
        else:
            values = col.values if isinstance(col,
                                              pd.Series) else col.values_host
            inputs.append(
                _convert_column_to_triton_input(values, name, input_class))
    return inputs
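
A list column is therefore shipped to Triton as two flat tensors: the leaf values under name + "__values" and the row offsets under name + "__nnzs". A tiny illustration of that layout (data made up):

# The list column [[10, 11], [], [12, 13, 14]] as the two arrays sent to Triton.
values = [10, 11, 12, 13, 14]    # "<name>__values"
offsets = [0, 2, 2, 5]           # "<name>__nnzs"; row i spans offsets[i]:offsets[i+1]
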
Example #14
def _parse_input_dtypes(dtypes):
    input_dtypes = {col: dtype for col, dtype in dtypes if not _is_list_dtype(dtype)}
    input_multihots = {col: dtype for col, dtype in dtypes if _is_list_dtype(dtype)}

    return input_dtypes, input_multihots
Example #15
def _is_list_col(column_group, df):
    has_lists = any(_is_list_dtype(df[col]) for col in column_group)
    if has_lists and len(column_group) != 1:
        raise ValueError("Can't categorical encode multiple list columns")
    return has_lists