def drop_duplicate_rows(data: dd = None, subset: List[str] = None, keep: str = None) -> dd:
    """
    Drop rows containing duplicate data for the specified subset of columns

    :param data: dask dataframe
    :param subset: list of column names used to detect duplicates; forwarded
        unchanged (including None) to ``data.drop_duplicates``
    :param keep: which duplicate to keep; when None the argument is not
        forwarded at all, so the dataframe's own default behaviour applies
    :return: modified dask dataframe
    """
    options = {"subset": subset}
    # Forwarding keep=None verbatim is rejected by pandas-style
    # drop_duplicates (only 'first', 'last' or False are accepted), which made
    # the function unusable with its own defaults. Only pass it when given.
    # NOTE(review): dask reportedly does not support keep=False - confirm
    # against the dask version in use.
    if keep is not None:
        options["keep"] = keep
    return data.drop_duplicates(**options)
def transform(self, X: dd, y=None):
    """
    Return the input dataframe with duplicated rows dropped.

    Duplicates are detected on the columns held in ``self.subset``, which is
    handed straight to ``X.drop_duplicates``.

    Args:
        X (dd): Dataframe to be processed
        y (dd, optional): Target; not used by this method. Defaults to None.

    Returns:
        (dd): Dataframe with rows removed
    """
    remove_duplicates = X.drop_duplicates
    deduplicated = remove_duplicates(subset=self.subset)
    return deduplicated