    def transform(self, col_selector: ColumnSelector,
                  df: DataFrameType) -> DataFrameType:
        new_df = type(df)()
        tmp = "__tmp__"  # Temporary column for sorting
        df[tmp] = _arange(len(df), like_df=df, dtype="int32")

        cat_names, multi_col_group = nvt_cat._get_multicolumn_names(
            col_selector, df.columns, self.name_sep)

        _read_pq_func = _read_parquet_dispatch(df)
        for name in cat_names:
            storage_name = self.storage_name.get(name, name)
            name = multi_col_group.get(name, name)
            path = self.categories[storage_name]
            selection_l = list(name) if isinstance(name, tuple) else [name]
            selection_r = list(name) if isinstance(name, tuple) else [storage_name]

            stat_df = nvt_cat._read_groupby_stat_df(path, storage_name,
                                                    self.cat_cache,
                                                    _read_pq_func)
            tran_df = df[selection_l + [tmp]].merge(stat_df,
                                                    left_on=selection_l,
                                                    right_on=selection_r,
                                                    how="left")
            tran_df = tran_df.sort_values(tmp)
            tran_df.drop(columns=selection_l + [tmp], inplace=True)
            new_cols = [c for c in tran_df.columns if c not in new_df.columns]
            new_part = tran_df[new_cols].reset_index(drop=True)
            new_df = _concat_columns([new_df, new_part])
        df.drop(columns=[tmp], inplace=True)
        return new_df
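
Every example on this page leans on _concat_columns to stitch per-group results back into a single frame, aligned by row position. A minimal sketch of what such a helper could look like, assuming pandas-only frames (the real NVTabular helper dispatches between pandas and cuDF):

import pandas as pd

def concat_columns_sketch(frames):
    """Column-wise concat of frames that share row order (sketch, pandas only)."""
    if len(frames) == 1:
        return frames[0]
    # Reset indices so pandas aligns purely by position, not by index labels.
    return pd.concat([f.reset_index(drop=True) for f in frames], axis=1)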
Example #2
    def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
        # Add temporary column for sorting
        tmp = "__tmp__"
        df[tmp] = _arange(len(df), like_df=df, dtype="int32")

        fit_folds = self.kfold > 1
        if fit_folds:
            df[self.fold_name] = _add_fold(df.index, self.kfold, self.fold_seed)

        # Need the mean of the continuous target column
        y_mean = self.target_mean or self.means

        # Loop over categorical-column groups and apply logic
        new_df = None
        for ind, cat_group in enumerate(columns):
            if isinstance(cat_group, tuple):
                cat_group = list(cat_group)
            elif isinstance(cat_group, str):
                cat_group = [cat_group]

            if new_df is None:
                new_df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
            else:
                _df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
                new_df = _concat_columns([new_df, _df])

        # Drop temporary columns
        drop_cols = [tmp, self.fold_name] if fit_folds and self.drop_folds else [tmp]
        df.drop(columns=drop_cols, inplace=True)
        if fit_folds and not self.drop_folds:
            new_df[self.fold_name] = df[self.fold_name]
        return new_df
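
_add_fold is not shown on this page. A plausible stand-in that assigns each row a deterministic fold id; the name and signature here are assumptions, not NVTabular's actual helper:

import numpy as np

def add_fold_sketch(index, kfold, fold_seed=42):
    """Hypothetical helper: map each row to a fold id in [0, kfold)."""
    rng = np.random.RandomState(fold_seed)
    folds = np.arange(len(index), dtype="int32") % kfold
    rng.shuffle(folds)  # seeded shuffle keeps folds balanced but randomized
    return folds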
Example #3
def _transform_partition(root_df, column_groups):
    """Transforms a single partition by appyling all operators in a ColumnGroup"""
    output = None
    for column_group in column_groups:
        unique_flattened_cols = _get_unique(column_group.flattened_columns)
        # collect dependencies recursively if we have parents
        if column_group.parents:
            df = None
            columns = None
            for parent in column_group.parents:
                unique_flattened_cols_parent = _get_unique(
                    parent.flattened_columns)
                parent_df = _transform_partition(root_df, [parent])
                if df is None or not len(df):
                    df = parent_df[unique_flattened_cols_parent]
                    columns = set(unique_flattened_cols_parent)
                else:
                    new_columns = set(unique_flattened_cols_parent) - columns
                    df = _concat_columns([df, parent_df[list(new_columns)]])
                    columns.update(new_columns)
        else:
            # otherwise select the input from the root df
            df = root_df[unique_flattened_cols]

        # apply the operator if necessary
        if column_group.op:
            try:
                df = column_group.op.transform(column_group.input_column_names,
                                               df)
            except Exception:
                LOG.exception("Failed to transform operator %s",
                              column_group.op)
                raise
            if df is None:
                raise RuntimeError(
                    "Operator %s didn't return a value during transform" %
                    column_group.op)

        # dask needs the output columns in the same order as defined in meta,
        # so reorder them here; this also selects columns (handling the case of
        # removing columns from the output using the "-" overload)
        if output is None:
            output = df[unique_flattened_cols]
        else:
            output = _concat_columns([output, df[unique_flattened_cols]])

    return output
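
_get_unique above has to deduplicate column names while preserving their order (a plain set would lose the ordering that dask's meta relies on). An order-preserving version, assuming hashable names:

def get_unique_sketch(columns):
    """Order-preserving dedup; dict keys keep insertion order in Python 3.7+."""
    return list(dict.fromkeys(columns))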
Example #4
    def _concat_tensors(self, tensors, kind):
        if kind & (Supports.GPU_DATAFRAME | Supports.CPU_DATAFRAME):
            return _concat_columns(tensors)
        else:
            output = tensors[0]
            for tensor in tensors[1:]:
                output.update(tensor)
            return output
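
The else branch treats non-dataframe batches as dict-like tensor collections and merges them key by key. The same pattern with plain NumPy arrays:

import numpy as np

batches = [
    {"user_id": np.array([1, 2, 3])},
    {"item_id": np.array([9, 8, 7])},
]
merged = dict(batches[0])
for batch in batches[1:]:
    merged.update(batch)  # later keys overwrite earlier ones on collision
print(sorted(merged))  # ['item_id', 'user_id']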
Example #5
def _transform_partition(root_df, workflow_nodes, additional_columns=None):
    """Transforms a single partition by appyling all operators in a WorkflowNode"""
    output = None

    for node in workflow_nodes:
        node_input_cols = _get_unique(node.input_schema.column_names)
        node_output_cols = _get_unique(node.output_schema.column_names)
        addl_input_cols = set(node.dependency_columns.names)

        # Build input dataframe
        if node.parents_with_dependencies:
            # If there are parents, collect their outputs
            # to build the current node's input
            input_df = None
            seen_columns = None

            for parent in node.parents_with_dependencies:
                parent_output_cols = _get_unique(parent.output_schema.column_names)
                parent_df = _transform_partition(root_df, [parent])
                if input_df is None or not len(input_df):
                    input_df = parent_df[parent_output_cols]
                    seen_columns = set(parent_output_cols)
                else:
                    new_columns = set(parent_output_cols) - seen_columns
                    input_df = _concat_columns([input_df, parent_df[list(new_columns)]])
                    seen_columns.update(new_columns)

            # Check for additional input columns that aren't generated by parents
            # and fetch them from the root dataframe
            unseen_columns = set(node.input_schema.column_names) - seen_columns
            addl_input_cols = addl_input_cols.union(unseen_columns)

            # TODO: Find a better way to remove dupes
            addl_input_cols = addl_input_cols - set(input_df.columns)

            if addl_input_cols:
                input_df = _concat_columns([input_df, root_df[list(addl_input_cols)]])
        else:
            # If there are no parents, this is an input node,
            # so pull columns directly from root df
            input_df = root_df[node_input_cols + list(addl_input_cols)]

        # Compute the node's output
        if node.op:
            try:
                # use input_columns to ensure correct grouping (subgroups)
                selection = node.input_columns.resolve(node.input_schema)
                output_df = node.op.transform(selection, input_df)
            except Exception:
                LOG.exception("Failed to transform operator %s", node.op)
                raise
            if output_df is None:
                raise RuntimeError("Operator %s didn't return a value during transform" % node.op)
        else:
            output_df = input_df

        # Combine output across node loop iterations

        # dask needs the output columns in the same order as defined in meta,
        # so reorder them here; this also selects columns (handling the case of
        # removing columns from the output using the "-" overload)
        if output is None:
            output = output_df[node_output_cols]
        else:
            output = _concat_columns([output, output_df[node_output_cols]])

    if additional_columns:
        output = _concat_columns([output, root_df[_get_unique(additional_columns)]])

    return output
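
In a workflow like the one these snippets come from, a per-partition function such as _transform_partition would typically be mapped over a dask DataFrame. A runnable sketch of the call pattern, with passthrough_partition standing in for the real transform:

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": range(8)}), npartitions=2)

def passthrough_partition(df):
    # Stand-in for _transform_partition: returns the partition unchanged.
    return df

# meta tells dask the output schema ahead of time, matching the comment about
# column ordering in the examples above.
out = ddf.map_partitions(passthrough_partition, meta=ddf._meta)
print(out.compute().shape)  # (8, 1)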