Code example #1
    def create_features(
        self,
        train_df: cudf.DataFrame,
        test_df: cudf.DataFrame,
    ):

        with timer("load data"):
            train = train_df.copy()
            len_train = len(train)
            test = test_df.copy()

        with timer("concat train and test"):
            total = cudf.concat([train, test], ignore_index=True)

        with timer("category vectorizer"):
            new_features = []
            for i in [5, 10, 20, 30]:
                vectorizer = CategoryVectorizer(
                    categorical_columns=cat_var_list,
                    n_components=i,
                    transformer=LatentDirichletAllocation(n_components=i))
                new_feats = vectorizer.transform(total)
                new_features.append(new_feats)

            new_features = cudf.concat(new_features, axis=1)

        with timer("end"):

            self.train = new_features.iloc[:len_train].reset_index(drop=True)
            self.test = new_features.iloc[len_train:].reset_index(drop=True)
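
The CategoryVectorizer used above comes from the surrounding project and is not defined in this snippet. As a hedged, self-contained sketch of the idea (assuming it builds a co-occurrence count matrix over pairs of categorical columns and compresses it with the supplied transformer), a pandas/scikit-learn version might look like this; lda_category_features is an illustrative name, not the project's API:

import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation

def lda_category_features(df, col_a, col_b, n_components=5, seed=0):
    # Rows: categories of col_a; columns: categories of col_b.
    counts = pd.crosstab(df[col_a], df[col_b])
    lda = LatentDirichletAllocation(n_components=n_components, random_state=seed)
    topics = lda.fit_transform(counts)  # shape: (n_categories_of_a, n_components)
    topic_df = pd.DataFrame(
        topics,
        index=counts.index,
        columns=[f"lda_{col_a}_{col_b}_{i}" for i in range(n_components)],
    )
    # Attach each row's topic vector by looking up its col_a value.
    return df[[col_a]].join(topic_df, on=col_a).drop(columns=[col_a])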
Code example #2
File: categorify.py Project: yashsehgal/NVTabular
    def transform(self, columns: ColumnNames,
                  gdf: cudf.DataFrame) -> cudf.DataFrame:
        new_gdf = gdf.copy(deep=False)
        if isinstance(self.freq_threshold, dict):
            assert all(x in self.freq_threshold for x in columns)

        if self.encode_type == "combo":
            # Case (3) - We want to track multi- and single-column groups separately
            #            when we are NOT performing a joint encoding. This is because
            #            there is not a 1-to-1 mapping for columns in multi-col groups.
            #            We use `multi_col_group` to preserve the list format of
            #            multi-column groups only, and use `cat_names` to store the
            #            string representation of both single- and multi-column groups.
            #
            cat_names, multi_col_group = _get_multicolumn_names(
                columns, gdf.columns, self.name_sep)
        else:
            # Case (1) & (2) - Simple 1-to-1 mapping
            multi_col_group = {}
            cat_names = list(flatten(columns, container=tuple))

        # Encode each column-group separately
        for name in cat_names:
            # Use the column-group `list` directly (not the string name)
            use_name = multi_col_group.get(name, name)

            # Storage name may be different than group for case (2)
            # Only use the "aliased" `storage_name` if we are dealing with
            # a multi-column group, or if we are doing joint encoding

            if use_name != name or self.encode_type == "joint":
                storage_name = self.storage_name.get(name, name)
            else:
                storage_name = name

            if isinstance(use_name, tuple):
                use_name = list(use_name)

            path = self.categories[storage_name]
            new_gdf[name] = _encode(
                use_name,
                storage_name,
                path,
                gdf,
                self.cat_cache,
                na_sentinel=self.na_sentinel,
                freq_threshold=self.freq_threshold[name] if isinstance(
                    self.freq_threshold, dict) else self.freq_threshold,
                search_sorted=self.search_sorted,
                buckets=self.num_buckets,
                encode_type=self.encode_type,
                cat_names=cat_names,
            )
            if self.dtype:
                new_gdf[name] = new_gdf[name].astype(self.dtype, copy=False)

        return new_gdf
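
The heavy lifting happens in _encode, which lives elsewhere in categorify.py. A rough sketch of the assumed core mapping step: join the input column against a precomputed category table and send null or unseen values to na_sentinel. This is an illustration, not NVTabular's actual implementation; encode_with_table is a hypothetical helper:

import cupy
import cudf

def encode_with_table(ser, categories, na_sentinel=0):
    # Category table: value -> code, with codes starting at 1 so that
    # na_sentinel (0 by default) is reserved for nulls and unseen values.
    table = cudf.DataFrame({
        "value": categories,
        "code": cupy.arange(1, len(categories) + 1, dtype="int64"),
    })
    # Joins do not preserve row order in cudf, so carry an explicit order key.
    df = cudf.DataFrame({"value": ser,
                         "order": cupy.arange(len(ser), dtype="int64")})
    out = df.merge(table, on="value", how="left").sort_values("order")
    return out["code"].fillna(na_sentinel).reset_index(drop=True)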
Code example #3
    def create_features(
        self,
        train_df: cudf.DataFrame,
        test_df: cudf.DataFrame,
    ):

        with timer("load data"):
            train = train_df.copy()
            len_train = len(train)
            test = test_df.copy()

        with timer("concat train and test"):
            total = cudf.concat([train, test], ignore_index=True)

        with timer("label encoding"):
            with timer("rating"):
                rating_dict = {
                    "RP": 0,
                    "EC": 1,
                    "K-A": 2,
                    "E": 2,
                    "E10+": 3,
                    "T": 4,
                    "M": 5,
                    "AO": 5,
                }
                total["Rating"] = total["Rating"].replace(rating_dict).astype(
                    int)

            with timer("other cat cols"):
                cat_cols = [
                    "Name",
                    "Platform",
                    "Genre",
                    "Publisher",
                    "Developer",
                ]
                for col in cat_cols:
                    le = LabelEncoder(handle_unknown="ignore")
                    le.fit(total[col])
                    total[col] = le.transform(total[col]).astype("category")

        with timer("User_Score"):
            total["User_Score"] = (total["User_Score"].replace(
                to_replace="tbd", value=np.nan).astype(float))

        with timer("Year_of_Release"):
            total["Year_of_Release"] = total["Year_of_Release"].replace(
                to_replace=2020.0, value=2017.0)

        with timer("log_User_Count"):
            total["log_User_Count"] = np.log1p(total["User_Count"].to_pandas())

        with timer("end"):
            basic_cols = [
                "Name",
                "Platform",
                "Year_of_Release",
                "Genre",
                "Publisher",
                "Critic_Score",
                "Critic_Count",
                "User_Score",
                "User_Count",
                "log_User_Count",
                "Developer",
                "Rating",
            ]
            target_cols = [
                "NA_Sales",
                "EU_Sales",
                "JP_Sales",
                "Other_Sales",
                "Global_Sales",
            ]
            self.train = total[basic_cols +
                               target_cols].iloc[:len_train].reset_index(
                                   drop=True)
            self.test = total[basic_cols].iloc[len_train:].reset_index(
                drop=True)
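
The pattern in this and the first example (concatenate train and test, fit the encodings once, then split back with len_train) keeps label codes consistent across both splits. A minimal pandas illustration of the same pattern follows; the original uses cudf with a GPU LabelEncoder that supports handle_unknown="ignore":

import pandas as pd

train = pd.DataFrame({"Platform": ["Wii", "PS4", "X360"]})
test = pd.DataFrame({"Platform": ["PS4", "Switch"]})

# Fit the encoding on train and test jointly so both splits share codes.
total = pd.concat([train, test], ignore_index=True)
total["Platform"], _ = pd.factorize(total["Platform"])

train_enc = total.iloc[:len(train)].reset_index(drop=True)
test_enc = total.iloc[len(train):].reset_index(drop=True)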
Code example #4
    def apply_op(
        self,
        gdf: cudf.DataFrame,
        columns_ctx: dict,
        input_cols,
        target_cols=["base"],
        stats_context={},
    ):
        new_gdf = gdf.copy(deep=False)
        target_columns = self.get_columns(columns_ctx, input_cols, target_cols)
        if isinstance(self.freq_threshold, dict):
            assert all(x in self.freq_threshold for x in target_columns)
        if not target_columns:
            return new_gdf

        if self.column_groups and not self.encode_type == "joint":
            # Case (3) - We want to track multi- and single-column groups separately
            #            when we are NOT performing a joint encoding. This is because
            #            there is not a 1-to-1 mapping for columns in multi-col groups.
            #            We use `multi_col_group` to preserve the list format of
            #            multi-column groups only, and use `cat_names` to store the
            #            string representation of both single- and multi-column groups.
            #
            cat_names, multi_col_group = _get_multicolumn_names(
                self.column_groups, gdf.columns, self.name_sep
            )
        else:
            # Case (1) & (2) - Simple 1-to-1 mapping
            multi_col_group = {}
            cat_names = [name for name in target_columns if name in gdf.columns]

        # Encode each column-group separately
        for name in cat_names:
            new_col = f"{name}_{self._id}"

            # Use the column-group `list` directly (not the string name)
            use_name = multi_col_group.get(name, name)
            # Storage name may be different than group for case (2)
            # Only use the "aliased" `storage_name` if we are dealing with
            # a multi-column group, or if we are doing joint encoding
            if use_name != name or self.encode_type == "joint":
                storage_name = self.storage_name.get(name, name)
            else:
                storage_name = name
            path = stats_context[self.stat_name][storage_name]
            if not self.column_groups and _is_list_col([name], gdf):
                if "mh" not in columns_ctx["categorical"]:
                    columns_ctx["categorical"]["mh"] = []
                if name not in columns_ctx["categorical"]["mh"]:
                    columns_ctx["categorical"]["mh"].append(name)
            new_gdf[new_col] = _encode(
                use_name,
                storage_name,
                path,
                gdf,
                self.cat_cache,
                na_sentinel=self.na_sentinel,
                freq_threshold=self.freq_threshold[name]
                if isinstance(self.freq_threshold, dict)
                else self.freq_threshold,
                search_sorted=self.search_sorted,
                buckets=self.num_buckets,
                encode_type=self.encode_type,
                cat_names=cat_names,
            )
            if self.dtype:
                new_gdf[new_col] = new_gdf[new_col].astype(self.dtype, copy=False)

        # Deal with replacement
        if self.replace:
            for name in cat_names:
                new_col = f"{name}_{self._id}"
                new_gdf[name] = new_gdf[new_col]
                new_gdf.drop(columns=[new_col], inplace=True)

        self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns)
        return new_gdf
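
freq_threshold (a scalar or a per-column dict) is forwarded straight to _encode. Here is a hedged pandas sketch of the assumed semantics, namely that categories seen fewer than freq_threshold times fall into the na_sentinel bucket like unseen values; frequency_capped_codes is a hypothetical helper, not NVTabular code:

import pandas as pd

def frequency_capped_codes(ser, freq_threshold, na_sentinel=0):
    counts = ser.value_counts()
    keep = counts[counts >= freq_threshold].index
    # Rare categories behave like unseen values: they get the sentinel code.
    capped = ser.where(ser.isin(keep))
    codes, _ = pd.factorize(capped)
    codes = codes + 1                # factorize marks NaN as -1; shift real codes to 1..n
    codes[codes == 0] = na_sentinel  # the shifted -1s become the sentinel
    return pd.Series(codes, index=ser.index)

print(frequency_capped_codes(pd.Series(list("aaabbc")), freq_threshold=2).tolist())
# [1, 1, 1, 2, 2, 0]  ('c' appears once, so it maps to the sentinel)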
Code example #5
File: multiindex.py Project: williamBlazing/cudf
class MultiIndex(Index):
    """A multi-level or hierarchical index.

    Provides N-Dimensional indexing into Series and DataFrame objects.

    Properties
    ----------
    levels: Labels for each category in the index hierarchy.
    codes: Assignment of individual items into the categories of the hierarchy.
    names: Name for each level.
    """
    def __init__(self,
                 levels=None,
                 codes=None,
                 labels=None,
                 names=None,
                 **kwargs):
        from cudf.core.series import Series

        self.name = None
        self.names = names
        self._source_data = None
        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' "
                "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            self._source_data = kwargs["source_data"].reset_index(drop=True)
            self._codes = codes
            self._levels = levels
            return

        # name setup
        if isinstance(
                names,
            (
                Sequence,
                pd.core.indexes.frozen.FrozenNDArray,
                pd.core.indexes.frozen.FrozenList,
            ),
        ):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        from cudf import DataFrame

        if not isinstance(codes, DataFrame) and not isinstance(
                codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, DataFrame):
            self._codes = codes
        elif len(levels) == len(codes):
            self._codes = DataFrame()
            for i, codes in enumerate(codes):
                name = column_names[i] or i
                codes = column.as_column(codes)
                self._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError("MultiIndex has unequal number of levels and "
                             "codes and is inconsistent!")

        self._levels = [Series(level) for level in levels]
        self._validate_levels_and_codes(self._levels, self._codes)

        self._source_data = DataFrame()
        for i, name in enumerate(self._codes.columns):
            codes = as_index(self._codes[name]._column)
            if -1 in self._codes[name].values:
                # Must account for null(s) in _source_data column
                level = DataFrame(
                    {name: [None] + list(self._levels[i])},
                    index=range(-1, len(self._levels[i])),
                )
            else:
                level = DataFrame({name: self._levels[i]})
            level = DataFrame(index=codes).join(level)
            self._source_data[name] = level[name].reset_index(drop=True)

        self.names = [None] * len(self._levels) if names is None else names

    def _validate_levels_and_codes(self, levels, codes):
        if len(levels) != len(codes.columns):
            raise ValueError("MultiIndex has unequal number of levels and "
                             "codes and is inconsistent!")
        code_length = len(codes[codes.columns[0]])
        for index, code in enumerate(codes):
            if code_length != len(codes[code]):
                raise ValueError("MultiIndex length of codes does not match "
                                 "and is inconsistent!")
        for index, code in enumerate(codes):
            if codes[code].max() > len(levels[index]) - 1:
                raise ValueError("MultiIndex code %d contains value %d larger "
                                 "than maximum level size at this position")

    def copy(self, deep=True):
        mi = MultiIndex(source_data=self._source_data.copy(deep))
        if self._levels is not None:
            mi._levels = [s.copy(deep) for s in self._levels]
        if self._codes is not None:
            mi._codes = self._codes.copy(deep)
        if self.names is not None:
            mi.names = self.names.copy()
        return mi

    def deepcopy(self):
        return self.copy(deep=True)

    def __copy__(self):
        return self.copy(deep=True)

    def _popn(self, n):
        """ Returns a copy of this index without the left-most n values.

        Removes n names, labels, and codes in order to build a new index
        for results.
        """
        from cudf import DataFrame

        codes = DataFrame()
        for idx in self.codes.columns[n:]:
            codes.add_column(idx, self.codes[idx])
        result = MultiIndex(self.levels[n:], codes)
        if self.names is not None:
            result.names = self.names[n:]
        return result

    def __repr__(self):
        return ("MultiIndex(levels=" + str(self.levels) + ",\ncodes=" +
                str(self.codes) + ")")

    @property
    def codes(self):
        if self._codes is None:
            self._compute_levels_and_codes()
        return self._codes

    @property
    def levels(self):
        if self._levels is None:
            self._compute_levels_and_codes()
        return self._levels

    @property
    def labels(self):
        warnings.warn(
            "This feature is deprecated in pandas and will be"
            "dropped from cudf as well.",
            FutureWarning,
        )
        return self.codes

    def _compute_levels_and_codes(self):
        levels = []
        from cudf import DataFrame

        codes = DataFrame()
        for name in self._source_data.columns:
            code, cats = self._source_data[name].factorize()
            codes[name] = code.reset_index(drop=True).astype(np.int64)
            cats.name = None
            cats = cats.reset_index(drop=True)._copy_construct(name=None)
            levels.append(cats)

        self._levels = levels
        self._codes = codes

    def _compute_validity_mask(self, index, row_tuple, max_length):
        """ Computes the valid set of indices of values in the lookup
        """
        from cudf import DataFrame
        from cudf import Series
        from cudf import concat
        from cudf.utils.cudautils import arange

        lookup = DataFrame()
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            lookup[index._source_data.columns[idx]] = Series(row)
        data_table = concat(
            [
                index._source_data,
                DataFrame({"idx": Series(arange(len(index._source_data)))}),
            ],
            axis=1,
        )
        result = lookup.merge(data_table)["idx"]
        # Avoid computing levels unless the result of the merge is empty,
        # which suggests that a KeyError should be raised.
        if len(result) == 0:
            for idx, row in enumerate(row_tuple):
                if row == slice(None):
                    continue
                if row not in index.levels[idx]._column:
                    raise KeyError(row)
        return result

    def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
        from cudf.utils.cudautils import arange
        from cudf import Series

        # Instructions for Slicing
        # if tuple, get first and last elements of tuple
        # if open beginning tuple, get 0 to highest valid_index
        # if open ending tuple, get highest valid_index to len()
        # if not open end or beginning, get range lowest beginning index
        # to highest ending index
        if isinstance(row_tuple, slice):
            if (isinstance(row_tuple.start, numbers.Number)
                    or isinstance(row_tuple.stop, numbers.Number)
                    or row_tuple == slice(None)):
                stop = row_tuple.stop or max_length
                start, stop, step = row_tuple.indices(stop)
                return arange(start, stop, step)
            start_values = self._compute_validity_mask(index, row_tuple.start,
                                                       max_length)
            stop_values = self._compute_validity_mask(index, row_tuple.stop,
                                                      max_length)
            return Series(arange(start_values.min(), stop_values.max() + 1))
        elif isinstance(row_tuple, numbers.Number):
            return row_tuple
        return self._compute_validity_mask(index, row_tuple, max_length)

    def _index_and_downcast(self, result, index, index_key):
        from cudf import DataFrame
        from cudf import Series

        if isinstance(index_key, (numbers.Number, slice)):
            index_key = [index_key]
        if (len(index_key) > 0
                and not isinstance(index_key, tuple)) or isinstance(
                    index_key[0], slice):
            index_key = index_key[0]

        slice_access = False
        if isinstance(index_key, slice):
            slice_access = True
        out_index = DataFrame()
        # Select the last n-k columns where n is the number of _source_data
        # columns and k is the length of the indexing tuple
        size = 0
        if not isinstance(index_key, (numbers.Number, slice)):
            size = len(index_key)
        for k in range(size, len(index._source_data.columns)):
            out_index.add_column(
                index.names[k],
                index._source_data[index._source_data.columns[k]],
            )

        if len(result) == 1 and size == 0 and slice_access is False:
            # If the final result is one row and it was not mapped into
            # directly, return a Series with a tuple as name.
            result = result.T
            result = result[result.columns[0]]
        elif len(result) == 0 and slice_access is False:
            # Pandas returns an empty Series with a tuple as its name,
            # built from the first value of each index column
            series_name = []
            for idx, code in enumerate(index._source_data.columns):
                series_name.append(index._source_data[code][0])
            result = Series([])
            result.name = tuple(series_name)
        elif len(out_index.columns) == 1:
            # If there's only one column remaining in the output index, convert
            # it into an Index and name the final index values according
            # to the _source_data column names
            last_column = index._source_data.columns[-1]
            out_index = index._source_data[last_column]
            out_index = as_index(out_index)
            out_index.name = index.names[len(index.names) - 1]
            index = out_index
        elif len(out_index.columns) > 1:
            # Otherwise pop the leftmost levels, names, and codes from the
            # source index until it has the correct number of columns (n-k)
            result = result.reset_index(drop=True)
            index = index._popn(size)
        if isinstance(index_key, tuple):
            result = result.set_index(index)
        return result

    def _get_row_major(self, df, row_tuple):
        from cudf import Series

        valid_indices = self._get_valid_indices_by_tuple(
            df.index, row_tuple, len(df.index))
        indices = Series(valid_indices)
        result = df.take(indices)
        final = self._index_and_downcast(result, result.index, row_tuple)
        return final

    def _get_column_major(self, df, row_tuple):
        from cudf import Series
        from cudf import DataFrame

        valid_indices = self._get_valid_indices_by_tuple(
            df.columns, row_tuple, len(df._cols))
        result = df._take_columns(valid_indices)
        if isinstance(row_tuple, (numbers.Number, slice)):
            row_tuple = [row_tuple]
        if len(result) == 0 and len(result.columns) == 0:
            result_columns = df.columns.copy(deep=False)
            clear_codes = DataFrame()
            for name in df.columns.names:
                clear_codes[name] = Series([])
            result_columns._codes = clear_codes
            result_columns._source_data = clear_codes
            result.columns = result_columns
        elif len(row_tuple) < len(
                self.levels) and (not slice(None) in row_tuple
                                  and not isinstance(row_tuple[0], slice)):
            columns = self._popn(len(row_tuple))
            result.columns = columns.take(valid_indices)
        else:
            result.columns = self.take(valid_indices)
        if len(result.columns.levels) == 1:
            columns = []
            for code in result.columns.codes[result.columns.codes.columns[0]]:
                columns.append(result.columns.levels[0][code])
            name = result.columns.names[0]
            result.columns = as_index(columns, name=name)
        if len(row_tuple) == len(self.levels) and len(result.columns) == 1:
            result = list(result._cols.values())[0]
        return result

    def _split_tuples(self, tuples):
        if len(tuples) == 1:
            return tuples, slice(None)
        elif isinstance(tuples[0], tuple):
            row = tuples[0]
            if len(tuples) == 1:
                column = slice(None)
            else:
                column = tuples[1]
            return row, column
        elif isinstance(tuples[0], slice):
            return tuples
        else:
            return tuples, slice(None)

    def __len__(self):
        return len(self._source_data)

    def equals(self, other):
        if self is other:
            return True
        if len(self) != len(other):
            return False
        return self == other

    def __eq__(self, other):
        if not hasattr(other, "_levels"):
            return False
        # Lazy comparison
        if isinstance(other, MultiIndex) or hasattr(other, "_source_data"):
            return self._source_data.equals(other._source_data)
        else:
            # Lazy comparison isn't possible - MI was created manually.
            # Actually compare the MI, not its source data (it doesn't have
            # any).
            equal_levels = self.levels == other.levels
            if isinstance(equal_levels, np.ndarray):
                equal_levels = equal_levels.all()
            return (equal_levels and self.codes.equals(other.codes)
                    and self.names == other.names)

    @property
    def is_contiguous(self):
        return True

    @property
    def size(self):
        return len(self._source_data)

    def take(self, indices):
        from collections.abc import Sequence
        from cudf import Series
        from numbers import Integral

        if isinstance(indices, (Integral, Sequence)):
            indices = np.array(indices)
        elif isinstance(indices, Series):
            if indices.null_count != 0:
                raise ValueError("Column must have no nulls.")
            indices = indices.data.mem
        elif isinstance(indices, slice):
            start, stop, step = indices.indices(len(self))
            indices = cudautils.arange(start, stop, step)
        result = MultiIndex(source_data=self._source_data.take(indices))
        if self._codes is not None:
            result._codes = self._codes.take(indices)
        if self._levels is not None:
            result._levels = self._levels
        result.names = self.names
        return result

    def serialize(self):
        """Serialize into pickle format suitable for file storage or network
        transmission.
        """
        header = {}
        header["type"] = pickle.dumps(type(self))
        header["names"] = pickle.dumps(self.names)

        header["source_data"], frames = self._source_data.serialize()

        return header, frames

    @classmethod
    def deserialize(cls, header, frames):
        """Convert from pickle format into Index
        """
        names = pickle.loads(header["names"])

        source_data_typ = pickle.loads(header["source_data"]["type"])
        source_data = source_data_typ.deserialize(header["source_data"],
                                                  frames)

        names = pickle.loads(header["names"])
        return MultiIndex(names=names, source_data=source_data)

    def __iter__(self):
        self.n = 0
        return self

    def __next__(self):
        if self.n < len(self.codes):
            result = self[self.n]
            self.n += 1
            return result
        else:
            raise StopIteration

    def __getitem__(self, index):
        # TODO: This should be a take of the _source_data only
        match = self.take(index)
        if isinstance(index, slice):
            return match
        result = []
        for level, item in enumerate(match.codes):
            result.append(match.levels[level][match.codes[item][0]])
        return tuple(result)

    def to_frame(self, index=True, name=None):
        df = self._source_data
        if index:
            df = df.set_index(self)
        if name:
            if len(name) != len(self.levels):
                raise ValueError("'name' should have th same length as "
                                 "number of levels on index.")
            df.columns = name
        return df

    def get_level_values(self, level):
        colnames = list(self._source_data.columns)
        if level not in colnames:
            if isinstance(level, int):
                if level < 0:
                    level = level + len(colnames)
                if level < 0 or level >= len(colnames):
                    raise IndexError(f"Invalid level number: '{level}'")
                level_idx = level
                level = colnames[level_idx]
            else:
                raise KeyError(f"Level not found: '{level}'")
        level_values = self._source_data[level]
        return level_values

    def _to_frame(self):
        from cudf import DataFrame, Series

        # for each column of codes
        # replace column with mapping from integers to levels
        df = self.codes.copy(deep=False)
        for idx, col in enumerate(df.columns):
            # use merge as a replace fn
            level = DataFrame({
                "idx":
                Series(
                    cudautils.arange(len(self.levels[idx]),
                                     dtype=df[col].dtype)),
                "level":
                self.levels[idx],
            })
            code = DataFrame({"idx": df[col]})
            df[col] = code.merge(level).level
        return df

    @property
    def _values(self):
        return list(self)

    @classmethod
    def _concat(cls, objs):
        from cudf import DataFrame, MultiIndex

        source_data = [o._source_data for o in objs]
        source_data = DataFrame._concat(source_data)
        names = [None for x in source_data.columns]
        objs = list(filter(lambda o: o.names is not None, objs))
        for o in range(len(objs)):
            for i, name in enumerate(objs[o].names):
                names[i] = names[i] or name
        return MultiIndex(names=names, source_data=source_data)

    @classmethod
    def from_tuples(cls, tuples, names=None):
        # Use Pandas for handling Python host objects
        pdi = pd.MultiIndex.from_tuples(tuples, names=names)
        result = cls.from_pandas(pdi)
        return result

    @classmethod
    def from_frame(cls, dataframe, names=None):
        return cls(source_data=dataframe, names=names)

    @classmethod
    def from_product(cls, arrays, names=None):
        # Use Pandas for handling Python host objects
        pdi = pd.MultiIndex.from_product(arrays, names=names)
        result = cls.from_pandas(pdi)
        return result

    def to_pandas(self):
        pandas_codes = []
        for code in self.codes.columns:
            pandas_codes.append(self.codes[code].to_array())

        # We do two things here to mimic Pandas behavior:
        # 1. as_index() on each level, so DatetimeColumn becomes DatetimeIndex
        # 2. convert levels to numpy array so empty levels become Float64Index
        levels = np.array(
            [as_index(level).to_pandas() for level in self.levels])

        # Backwards compatibility:
        # Construct a dummy MultiIndex and check for the codes attr.
        # This indicates that it is pandas >= 0.24
        # If no codes attr is present it is pandas <= 0.23
        if hasattr(pd.MultiIndex([[]], [[]]), "codes"):
            pandas_mi = pd.MultiIndex(levels=levels, codes=pandas_codes)
        else:
            pandas_mi = pd.MultiIndex(levels=levels, labels=pandas_codes)
        if self.names is not None:
            pandas_mi.names = self.names
        return pandas_mi

    @classmethod
    def from_pandas(cls, multiindex):
        """
        Convert from a Pandas MultiIndex

        Raises
        ------
        TypeError for invalid input type.

        Examples
        --------
        >>> import cudf
        >>> import pandas as pd
        >>> pmi = pd.MultiIndex(levels=[['a', 'b'], ['c', 'd']],
                                codes=[[0, 1], [1, 0]])
        >>> cudf.from_pandas(pmi)
        MultiIndex( ... )
        """
        if not isinstance(multiindex, pd.MultiIndex):
            raise TypeError("not a pandas.MultiIndex")

        if hasattr(multiindex, "codes"):
            mi = cls(
                levels=multiindex.levels,
                codes=multiindex.codes,
                names=multiindex.names,
            )
        else:
            mi = cls(
                levels=multiindex.levels,
                codes=multiindex.labels,
                names=multiindex.names,
            )
        return mi

    @property
    def is_unique(self):
        if not hasattr(self, "_is_unique"):
            self._is_unique = (self._source_data._size ==
                               self._source_data.drop_duplicates()._size)
        return self._is_unique

    @property
    def is_monotonic_increasing(self):
        if not hasattr(self, "_is_monotonic_increasing"):
            self._is_monotonic_increasing = libcudf.issorted.issorted(
                self._source_data._columns)
        return self._is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self):
        if not hasattr(self, "_is_monotonic_decreasing"):
            self._is_monotonic_decreasing = libcudf.issorted.issorted(
                self._source_data._columns, [1] * len(self.levels))
        return self._is_monotonic_decreasing

    def repeat(self, repeats, axis=None):
        assert axis in (None, 0)
        return MultiIndex.from_frame(self._source_data.repeat(repeats),
                                     names=self.names)
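
A brief usage sketch of the constructor above in its levels/codes form, mirroring pandas. Note this file is an older cudf snapshot, so details may differ from current releases:

import cudf

mi = cudf.MultiIndex(
    levels=[["a", "b"], ["c", "d"]],
    codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
    names=["outer", "inner"],
)
df = cudf.DataFrame({"x": [1, 2, 3, 4]})
df.index = mi
# Tuple lookups are routed through _get_row_major above.
print(df.loc[("a", "d")])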
Code example #6
class MultiIndex(Index):
    """A multi-level or hierarchical index.

    Provides N-Dimensional indexing into Series and DataFrame objects.

    Properties
    ----------
    levels: Labels for each category in the index hierarchy.
    codes: Assignment of individual items into the categories of the hierarchy.
    names: Name for each level.
    """

    def __init__(
        self, levels=None, codes=None, labels=None, names=None, **kwargs
    ):
        from cudf.core.series import Series
        from cudf import DataFrame

        super().__init__()

        self._name = None

        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' " "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            source_data = kwargs["source_data"].copy(deep=False)
            source_data.reset_index(drop=True, inplace=True)

            if isinstance(source_data, pd.DataFrame):
                nan_as_null = kwargs.get("nan_as_null", None)
                source_data = DataFrame.from_pandas(
                    source_data, nan_as_null=nan_as_null
                )
            names = names if names is not None else source_data._data.names
            # if names are unique
            # try using those as the source_data column names:
            if len(dict.fromkeys(names)) == len(names):
                source_data.columns = names
            self._data = source_data._data
            self.names = names
            self._codes = codes
            self._levels = levels
            return

        # name setup
        if isinstance(
            names,
            (
                Sequence,
                pd.core.indexes.frozen.FrozenNDArray,
                pd.core.indexes.frozen.FrozenList,
            ),
        ):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        if not isinstance(codes, DataFrame) and not isinstance(
            codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)
        ):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, DataFrame):
            self._codes = codes
        elif len(levels) == len(codes):
            self._codes = DataFrame()
            for i, codes in enumerate(codes):
                name = column_names[i] or i
                codes = column.as_column(codes)
                self._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError(
                "MultiIndex has unequal number of levels and "
                "codes and is inconsistent!"
            )

        self._levels = [Series(level) for level in levels]
        self._validate_levels_and_codes(self._levels, self._codes)

        source_data = DataFrame()
        for i, name in enumerate(self._codes.columns):
            codes = as_index(self._codes[name]._column)
            if -1 in self._codes[name].values:
                # Must account for null(s) in _source_data column
                level = DataFrame(
                    {name: [None] + list(self._levels[i])},
                    index=range(-1, len(self._levels[i])),
                )
            else:
                level = DataFrame({name: self._levels[i]})

            import cudf._lib as libcudf

            source_data[name] = libcudf.copying.gather(
                level, codes._data.columns[0]
            )._data[name]

        self._data = source_data._data
        self.names = names

    @property
    def names(self):
        return self._names

    @names.setter
    def names(self, value):
        value = [None] * self.nlevels if value is None else value
        assert len(value) == self.nlevels
        self._names = pd.core.indexes.frozen.FrozenList(value)

    @classmethod
    def _from_table(cls, table, names=None):
        df = cudf.DataFrame(table._data)
        if names is None:
            names = df.columns
        return MultiIndex.from_frame(df, names=names)

    @property
    def _source_data(self):
        return cudf.DataFrame(self._data)

    @_source_data.setter
    def _source_data(self, value):
        self._data = value._data
        self._compute_levels_and_codes()

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    def _validate_levels_and_codes(self, levels, codes):
        if len(levels) != len(codes.columns):
            raise ValueError(
                "MultiIndex has unequal number of levels and "
                "codes and is inconsistent!"
            )
        code_length = len(codes[codes.columns[0]])
        for index, code in enumerate(codes):
            if code_length != len(codes[code]):
                raise ValueError(
                    "MultiIndex length of codes does not match "
                    "and is inconsistent!"
                )
        for index, code in enumerate(codes):
            if codes[code].max() > len(levels[index]) - 1:
                raise ValueError(
                    "MultiIndex code %d contains value %d larger "
                    "than maximum level size at this position"
                    % (index, codes[code].max())
                )

    def copy(self, deep=True):
        mi = MultiIndex(source_data=self._source_data.copy(deep))
        if self._levels is not None:
            mi._levels = [s.copy(deep) for s in self._levels]
        if self._codes is not None:
            mi._codes = self._codes.copy(deep)
        if self.names is not None:
            mi.names = self.names.copy()
        return mi

    def deepcopy(self):
        return self.copy(deep=True)

    def __copy__(self):
        return self.copy(deep=True)

    def _popn(self, n):
        """ Returns a copy of this index without the left-most n values.

        Removes n names, labels, and codes in order to build a new index
        for results.
        """
        result = MultiIndex(source_data=self._source_data.iloc[:, n:])
        if self.names is not None:
            result.names = self.names[n:]
        return result

    def __repr__(self):
        return (
            "MultiIndex(levels="
            + str(self.levels)
            + ",\ncodes="
            + str(self.codes)
            + ")"
        )

    @property
    def codes(self):
        if self._codes is None:
            self._compute_levels_and_codes()
        return self._codes

    @property
    def nlevels(self):
        return self._source_data.shape[1]

    @property
    def levels(self):
        if self._levels is None:
            self._compute_levels_and_codes()
        return self._levels

    @property
    def labels(self):
        warnings.warn(
            "This feature is deprecated in pandas and will be"
            "dropped from cudf as well.",
            FutureWarning,
        )
        return self.codes

    @property
    def ndim(self):
        """Dimension of the data. For MultiIndex ndim is always 2.
        """
        return 2

    def isin(self, values, level=None):
        """Return a boolean array where the index values are in values.

        Compute boolean array of whether each index value is found in
        the passed set of values. The length of the returned boolean
        array matches the length of the index.

        Parameters
        ----------
        values : set, list-like, Index or Multi-Index
            Sought values.
        level : str or int, optional
            Name or position of the index level to use (if the index
            is a MultiIndex).

        Returns
        -------
        is_contained : cupy array
            CuPy array of boolean values.

        Notes
        -----
        When `level` is None, `values` can only be MultiIndex, or a
        set/list-like tuples.
        When `level` is provided, `values` can be Index or MultiIndex,
        or a set/list-like tuples.
        """
        from cudf.utils.dtypes import is_list_like

        if level is None:
            if isinstance(values, cudf.MultiIndex):
                values_idx = values
            elif (
                (
                    isinstance(
                        values,
                        (
                            cudf.Series,
                            cudf.Index,
                            cudf.DataFrame,
                            column.ColumnBase,
                        ),
                    )
                )
                or (not is_list_like(values))
                or (
                    is_list_like(values)
                    and len(values) > 0
                    and not isinstance(values[0], tuple)
                )
            ):
                raise TypeError(
                    "values need to be a MultiIndex or set/list-like tuple "
                    "sequences when `level=None`."
                )
            else:
                values_idx = cudf.MultiIndex.from_tuples(
                    values, names=self.names
                )

            res = []
            for name in self.names:
                level_idx = self.get_level_values(name)
                value_idx = values_idx.get_level_values(name)

                existence = level_idx.isin(value_idx)
                res.append(existence)

            result = res[0]
            for i in res[1:]:
                result = result & i
        else:
            level_series = self.get_level_values(level)
            result = level_series.isin(values)

        return result

    def mask(self, cond, other=None, inplace=False):
        raise NotImplementedError(
            ".mask is not supported for MultiIndex operations"
        )

    def where(self, cond, other=None, inplace=False):
        raise NotImplementedError(
            ".where is not supported for MultiIndex operations"
        )

    def _compute_levels_and_codes(self):
        levels = []
        from cudf import DataFrame

        codes = DataFrame()
        for name in self._source_data.columns:
            code, cats = self._source_data[name].factorize()
            codes[name] = code.reset_index(drop=True).astype(np.int64)
            cats.name = None
            cats = cats.reset_index(drop=True)._copy_construct(name=None)
            levels.append(cats)

        self._levels = levels
        self._codes = codes

    def _compute_validity_mask(self, index, row_tuple, max_length):
        """ Computes the valid set of indices of values in the lookup
        """
        from cudf import DataFrame
        from cudf import Series
        from cudf import concat

        lookup = DataFrame()
        for idx, row in enumerate(row_tuple):
            if isinstance(row, slice) and row == slice(None):
                continue
            lookup[index._source_data.columns[idx]] = Series(row)
        data_table = concat(
            [
                index._source_data,
                DataFrame(
                    {"idx": Series(cupy.arange(len(index._source_data)))}
                ),
            ],
            axis=1,
        )
        result = lookup.merge(data_table)["idx"]
        # Avoid computing levels unless the result of the merge is empty,
        # which suggests that a KeyError should be raised.
        if len(result) == 0:
            for idx, row in enumerate(row_tuple):
                if row == slice(None):
                    continue
                if row not in index.levels[idx]._column:
                    raise KeyError(row)
        return result

    def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
        from cudf import Series

        # Instructions for Slicing
        # if tuple, get first and last elements of tuple
        # if open beginning tuple, get 0 to highest valid_index
        # if open ending tuple, get highest valid_index to len()
        # if not open end or beginning, get range lowest beginning index
        # to highest ending index
        if isinstance(row_tuple, slice):
            if (
                isinstance(row_tuple.start, numbers.Number)
                or isinstance(row_tuple.stop, numbers.Number)
                or row_tuple == slice(None)
            ):
                stop = row_tuple.stop or max_length
                start, stop, step = row_tuple.indices(stop)
                return cupy.arange(start, stop, step)
            start_values = self._compute_validity_mask(
                index, row_tuple.start, max_length
            )
            stop_values = self._compute_validity_mask(
                index, row_tuple.stop, max_length
            )
            return Series(
                cupy.arange(start_values.min(), stop_values.max() + 1)
            )
        elif isinstance(row_tuple, numbers.Number):
            return row_tuple
        return self._compute_validity_mask(index, row_tuple, max_length)

    def _index_and_downcast(self, result, index, index_key):
        from cudf import DataFrame
        from cudf import Series

        if isinstance(index_key, (numbers.Number, slice)):
            index_key = [index_key]
        if (
            len(index_key) > 0 and not isinstance(index_key, tuple)
        ) or isinstance(index_key[0], slice):
            index_key = index_key[0]

        slice_access = False
        if isinstance(index_key, slice):
            slice_access = True
        out_index = DataFrame()
        # Select the last n-k columns where n is the number of _source_data
        # columns and k is the length of the indexing tuple
        size = 0
        if not isinstance(index_key, (numbers.Number, slice)):
            size = len(index_key)
        for k in range(size, len(index._source_data.columns)):
            if index.names is None:
                name = k
            else:
                name = index.names[k]
            out_index.insert(
                len(out_index.columns),
                name,
                index._source_data[index._source_data.columns[k]],
            )

        if len(result) == 1 and size == 0 and slice_access is False:
            # If the final result is one row and it was not mapped into
            # directly, return a Series with a tuple as name.
            result = result.T
            result = result[result._data.names[0]]
        elif len(result) == 0 and slice_access is False:
            # Pandas returns an empty Series with a tuple as its name,
            # built from the first value of each index column
            series_name = []
            for idx, code in enumerate(index._source_data.columns):
                series_name.append(index._source_data[code][0])
            result = Series([])
            result.name = tuple(series_name)
        elif len(out_index.columns) == 1:
            # If there's only one column remaining in the output index, convert
            # it into an Index and name the final index values according
            # to the _source_data column names
            last_column = index._source_data.columns[-1]
            out_index = index._source_data[last_column]
            out_index = as_index(out_index)
            out_index.name = index.names[len(index.names) - 1]
            index = out_index
        elif len(out_index.columns) > 1:
            # Otherwise pop the leftmost levels, names, and codes from the
            # source index until it has the correct number of columns (n-k)
            result = result.reset_index(drop=True)
            index = index._popn(size)
        if isinstance(index_key, tuple):
            result = result.set_index(index)
        return result

    def _get_row_major(self, df, row_tuple):
        from cudf import Series

        if pd.api.types.is_bool_dtype(row_tuple):
            return df[row_tuple]

        valid_indices = self._get_valid_indices_by_tuple(
            df.index, row_tuple, len(df.index)
        )
        indices = Series(valid_indices)
        result = df.take(indices)
        final = self._index_and_downcast(result, result.index, row_tuple)
        return final

    def _split_tuples(self, tuples):
        if len(tuples) == 1:
            return tuples, slice(None)
        elif isinstance(tuples[0], tuple):
            row = tuples[0]
            if len(tuples) == 1:
                column = slice(None)
            else:
                column = tuples[1]
            return row, column
        elif isinstance(tuples[0], slice):
            return tuples
        else:
            return tuples, slice(None)

    def __len__(self):
        return len(next(iter(self._data.columns)))

    def equals(self, other):
        if self is other:
            return True
        if len(self) != len(other):
            return False
        return self == other

    def __eq__(self, other):
        if not hasattr(other, "_levels"):
            return False
        # Lazy comparison
        if isinstance(other, MultiIndex) or hasattr(other, "_source_data"):
            for self_col, other_col in zip(
                self._source_data._data.values(),
                other._source_data._data.values(),
            ):
                if not self_col.equals(other_col):
                    return False
            return self.names == other.names
        else:
            # Lazy comparison isn't possible - MI was created manually.
            # Actually compare the MI, not its source data (it doesn't have
            # any).
            equal_levels = self.levels == other.levels
            if isinstance(equal_levels, np.ndarray):
                equal_levels = equal_levels.all()
            return (
                equal_levels
                and self.codes.equals(other.codes)
                and self.names == other.names
            )

    @property
    def is_contiguous(self):
        return True

    @property
    def size(self):
        return len(self._source_data)

    def take(self, indices):
        from collections.abc import Sequence
        from cudf import Series
        from numbers import Integral

        if isinstance(indices, (Integral, Sequence)):
            indices = np.array(indices)
        elif isinstance(indices, Series):
            if indices.has_nulls:
                raise ValueError("Column must have no nulls.")
        elif isinstance(indices, slice):
            start, stop, step = indices.indices(len(self))
            indices = cupy.arange(start, stop, step)
        result = MultiIndex(source_data=self._source_data.take(indices))
        if self._codes is not None:
            result._codes = self._codes.take(indices)
        if self._levels is not None:
            result._levels = self._levels
        result.names = self.names
        return result

    def serialize(self):
        """Serialize into pickle format suitable for file storage or network
        transmission.
        """
        header = {}
        header["type-serialized"] = pickle.dumps(type(self))
        header["names"] = pickle.dumps(self.names)

        header["source_data"], frames = self._source_data.serialize()

        return header, frames

    @classmethod
    def deserialize(cls, header, frames):
        """Convert from pickle format into Index
        """
        names = pickle.loads(header["names"])

        source_data_typ = pickle.loads(
            header["source_data"]["type-serialized"]
        )
        source_data = source_data_typ.deserialize(
            header["source_data"], frames
        )

        names = pickle.loads(header["names"])
        return MultiIndex(names=names, source_data=source_data)

    def __iter__(self):
        self.n = 0
        return self

    def __next__(self):
        if self.n < len(self.codes):
            result = self[self.n]
            self.n += 1
            return result
        else:
            raise StopIteration

    def __getitem__(self, index):
        # TODO: This should be a take of the _source_data only
        match = self.take(index)
        if isinstance(index, slice):
            return match
        result = []
        for level, item in enumerate(match.codes):
            result.append(match.levels[level][match.codes[item].iloc[0]])
        return tuple(result)

    def to_frame(self, index=True, name=None):
        df = self._source_data
        if index:
            df = df.set_index(self)
        if name is not None:
            if len(name) != len(self.levels):
                raise ValueError(
                    "'name' should have th same length as "
                    "number of levels on index."
                )
            df.columns = name
        return df

    def get_level_values(self, level):
        """
        Return the values at the requested level

        Parameters
        ----------
        level : int or label

        Returns
        -------
        An Index containing the values at the requested level.
        """
        colnames = list(self._source_data.columns)
        if level not in colnames:
            if isinstance(level, int):
                if level < 0:
                    level = level + len(colnames)
                if level < 0 or level >= len(colnames):
                    raise IndexError(f"Invalid level number: '{level}'")
                level_idx = level
                level = colnames[level_idx]
            elif level in self.names:
                level_idx = list(self.names).index(level)
                level = colnames[level_idx]
            else:
                raise KeyError(f"Level not found: '{level}'")
        else:
            level_idx = colnames.index(level)
        level_values = as_index(
            self._source_data._data[level], name=self.names[level_idx]
        )
        return level_values

    def _to_frame(self):
        from cudf import DataFrame, Series

        # for each column of codes
        # replace column with mapping from integers to levels
        df = self.codes.copy(deep=False)
        for idx, col in enumerate(df.columns):
            # use merge as a replace fn
            level = DataFrame(
                {
                    "idx": Series(
                        cupy.arange(len(self.levels[idx]), dtype=df[col].dtype)
                    ),
                    "level": self.levels[idx],
                }
            )
            code = DataFrame({"idx": df[col]})
            df[col] = code.merge(level).level
        return df

    @property
    def _values(self):
        return list(self)

    @classmethod
    def _concat(cls, objs):
        from cudf import DataFrame, MultiIndex

        source_data = [o._source_data for o in objs]
        source_data = DataFrame._concat(source_data)
        names = [None for x in source_data.columns]
        objs = list(filter(lambda o: o.names is not None, objs))
        for o in range(len(objs)):
            for i, name in enumerate(objs[o].names):
                names[i] = names[i] or name
        return MultiIndex(names=names, source_data=source_data)

    @classmethod
    def from_tuples(cls, tuples, names=None):
        # Use Pandas for handling Python host objects
        pdi = pd.MultiIndex.from_tuples(tuples, names=names)
        result = cls.from_pandas(pdi)
        return result

    @classmethod
    def from_frame(cls, dataframe, names=None):
        return cls(source_data=dataframe, names=names)

    @classmethod
    def from_product(cls, arrays, names=None):
        # Use Pandas for handling Python host objects
        pdi = pd.MultiIndex.from_product(arrays, names=names)
        result = cls.from_pandas(pdi)
        return result

    def to_pandas(self):
        if hasattr(self, "_source_data"):
            result = self._source_data.to_pandas()
            result.columns = self.names
            return pd.MultiIndex.from_frame(result)

        pandas_codes = []
        for code in self.codes.columns:
            pandas_codes.append(self.codes[code].to_array())

        # We do two things here to mimic Pandas behavior:
        # 1. as_index() on each level, so DatetimeColumn becomes DatetimeIndex
        # 2. convert levels to numpy array so empty levels become Float64Index
        levels = np.array(
            [as_index(level).to_pandas() for level in self.levels]
        )

        # Backwards compatibility:
        # Construct a dummy MultiIndex and check for the codes attr.
        # This indicates that it is pandas >= 0.24
        # If no codes attr is present it is pandas <= 0.23
        if hasattr(pd.MultiIndex([[]], [[]]), "codes"):
            pandas_mi = pd.MultiIndex(levels=levels, codes=pandas_codes)
        else:
            pandas_mi = pd.MultiIndex(levels=levels, labels=pandas_codes)
        if self.names is not None:
            pandas_mi.names = self.names
        return pandas_mi
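
    # Round-trip sketch (assumes pandas is importable as pd, as above):
    #   >>> gmi = cudf.MultiIndex.from_tuples([("a", 1), ("b", 2)])
    #   >>> isinstance(gmi.to_pandas(), pd.MultiIndex)   # True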

    @classmethod
    def from_pandas(cls, multiindex, nan_as_null=None):
        """
        Convert from a Pandas MultiIndex

        Raises
        ------
        TypeError for invalid input type.

        Examples
        --------
        >>> import cudf
        >>> import pandas as pd
        >>> pmi = pd.MultiIndex(levels=[['a', 'b'], ['c', 'd']],
        ...                     codes=[[0, 1], [1, 0]])
        >>> cudf.from_pandas(pmi)
        MultiIndex( ... )
        """
        if not isinstance(multiindex, pd.MultiIndex):
            raise TypeError("not a pandas.MultiIndex")

        mi = cls(
            names=multiindex.names,
            source_data=multiindex.to_frame(),
            nan_as_null=nan_as_null,
        )

        return mi

    @property
    def is_unique(self):
        if not hasattr(self, "_is_unique"):
            self._is_unique = len(self._source_data) == len(
                self._source_data.drop_duplicates(ignore_index=True)
            )
        return self._is_unique

    @property
    def is_monotonic_increasing(self):
        if not hasattr(self, "_is_monotonic_increasing"):
            self._is_monotonic_increasing = self._is_sorted(
                ascending=None, null_position=None
            )
        return self._is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self):
        if not hasattr(self, "_is_monotonic_decreasing"):
            self._is_monotonic_decreasing = self._is_sorted(
                ascending=[False] * len(self.levels), null_position=None
            )
        return self._is_monotonic_decreasing

    def argsort(self, ascending=True):
        return self._source_data.argsort(ascending=ascending)

    def unique(self):
        return MultiIndex.from_frame(self._source_data.drop_duplicates())
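
    # Usage sketch: `unique` and `is_unique` (above) are both computed on
    # the underlying source frame, e.g.
    #   >>> mi = cudf.MultiIndex.from_tuples([("a", 1), ("a", 1), ("b", 2)])
    #   >>> mi.is_unique          # False: ("a", 1) repeats
    #   >>> len(mi.unique())      # 2 distinct rows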

    def memory_usage(self, deep=False):
        n = 0
        for col in self._source_data._columns:
            n += col._memory_usage(deep=deep)
        if self._levels:
            for level in self._levels:
                n += level.memory_usage(deep=deep)
        if self._codes:
            for col in self._codes._columns:
                n += col._memory_usage(deep=deep)
        return n

    def difference(self, other, sort=None):
        # Fall back to pandas for the set difference.
        temp_self = self.to_pandas()
        temp_other = other
        if hasattr(other, "to_pandas"):
            temp_other = other.to_pandas()
        return temp_self.difference(temp_other, sort=sort)
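
    # Usage sketch: the result is a pandas MultiIndex, because the
    # computation falls back to pandas.
    #   >>> a = cudf.MultiIndex.from_tuples([("a", 1), ("b", 2)])
    #   >>> b = cudf.MultiIndex.from_tuples([("b", 2)])
    #   >>> a.difference(b)       # pandas MultiIndex containing ("a", 1)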

    def nan_to_num(*args, **kwargs):
        # Shim reached via __array_function__ dispatch: a MultiIndex has
        # nothing to replace, so np.nan_to_num returns it unchanged.
        return args[0]

    def array_equal(*args, **kwargs):
        # Shim reached via __array_function__ dispatch for np.array_equal.
        return args[0] == args[1]

    def __array_function__(self, func, types, args, kwargs):
        cudf_df_module = MultiIndex

        for submodule in func.__module__.split(".")[1:]:
            # point cudf to the correct submodule
            if hasattr(cudf_df_module, submodule):
                cudf_df_module = getattr(cudf_df_module, submodule)
            else:
                return NotImplemented

        fname = func.__name__

        handled_types = [cudf_df_module, np.ndarray]

        for t in types:
            if t not in handled_types:
                return NotImplemented

        if hasattr(cudf_df_module, fname):
            cudf_func = getattr(cudf_df_module, fname)
            # Handle case if cudf_func is same as numpy function
            if cudf_func is func:
                return NotImplemented
            else:
                return cudf_func(*args, **kwargs)
        else:
            return NotImplemented
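
    # Dispatch sketch (assumes a NumPy version with __array_function__
    # support): a call such as np.unique(mi) resolves "unique" on
    # MultiIndex and invokes MultiIndex.unique(mi); anything unresolved
    # returns NotImplemented so NumPy can raise its usual TypeError.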

    def _mimic_inplace(self, other, inplace=False):
        if inplace is True:
            for in_col, oth_col in zip(
                self._source_data._columns, other._source_data._columns,
            ):
                in_col._mimic_inplace(oth_col, inplace=True)
        else:
            return other
Code example #7
File: create_train_test.py Project: karunru/atmaCup8
def merge_all(
    df: cudf.DataFrame,
    campaign: cudf.DataFrame,
    map_game_feed_native_video_assets: cudf.DataFrame,
    advertiser_video: cudf.DataFrame,
    advertiser_converted_video: cudf.DataFrame,
) -> cudf.DataFrame:
    # merge df and campaign
    res = cudf.merge(
        df, campaign, left_on="campaign_id", right_on="id", how="left"
    ).drop(
        columns=["id", "mst_advertiser_id"]
    )  # remove campaign keys

    # merge res and map_game_feed_native_video_assets
    res = cudf.merge(
        res,
        map_game_feed_native_video_assets,
        left_on="game_feed_id",
        right_on="mst_game_feed_id",
        how="left",
    ).drop(
        columns=["mst_game_feed_id"]
    )  # remove map_game_feed_native_video_assets keys

    # merge res and advertiser_video (horizontal case)
    horizontal = advertiser_video.copy()
    left_keys = ["horizontal_mst_advertiser_video_id", "advertiser_id"]
    right_keys = ["id", "mst_advertiser_id"]
    horizontal.columns = [
        f"horizontal_{c}" if c not in right_keys else c for c in horizontal.columns
    ]
    res = cudf.merge(
        res, horizontal, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_video keys

    # merge res and advertiser_video (vertical case)
    vertical = advertiser_video.copy()
    left_keys = ["vertical_mst_advertiser_video_id", "advertiser_id"]
    right_keys = ["id", "mst_advertiser_id"]
    vertical.columns = [
        f"vertical_{c}" if c not in right_keys else c for c in vertical.columns
    ]
    res = cudf.merge(
        res, vertical, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_video keys

    # merge res and advertiser_converted_video (horizontal case)
    left_keys = [
        "horizontal_mst_advertiser_video_id",
        "game_feed_id",
        "video_template_id",
    ]
    right_keys = [
        "mst_advertiser_video_id",
        "mst_game_feed_id",
        "mst_video_template_id",
    ]
    horizontal = advertiser_converted_video.copy()
    horizontal.columns = [
        f"horizontal_converted_{c}" if c not in right_keys else c
        for c in horizontal.columns
    ]
    res = cudf.merge(
        res, horizontal, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_converted_video keys

    # merge res and advertiser_converted_video (vertical case)
    left_keys = [
        "vertical_mst_advertiser_video_id",
        "game_feed_id",
        "video_template_id",
    ]
    right_keys = [
        "mst_advertiser_video_id",
        "mst_game_feed_id",
        "mst_video_template_id",
    ]
    vertical = advertiser_converted_video.copy()
    vertical.columns = [
        f"vertical_converted_{c}" if c not in right_keys else c
        for c in vertical.columns
    ]
    res = cudf.merge(
        res, vertical, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_converted_video keys

    return res
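
The recurring pattern in merge_all is: prefix the payload columns of a
dimension table, left-merge on the join keys, then drop the keys. A
minimal, self-contained sketch of that pattern (synthetic frame and
column names, not from the atmaCup8 project):

import cudf

fact = cudf.DataFrame({"video_id": [1, 2, 3], "clicks": [10, 0, 5]})
dim = cudf.DataFrame({"id": [1, 2], "duration_sec": [15, 30]})

# Prefix the payload columns so the same dimension table can be merged
# more than once (e.g. horizontal/vertical) without name collisions.
right_keys = ["id"]
dim.columns = [
    f"horizontal_{c}" if c not in right_keys else c for c in dim.columns
]

res = cudf.merge(
    fact, dim, left_on=["video_id"], right_on=right_keys, how="left"
).drop(columns=right_keys)
# res columns: video_id, clicks, horizontal_duration_sec;
# video_id == 3 has no match, so its duration is null.
print(res)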