Ejemplo n.º 1
0
 def __init__(self, *args, groups=False, cols_init=None, **kwargs):
     """Collect the initial column groups and initialise the parent class.

     Args:
         *args: Columns (or groups of columns) to work on.
         groups (bool): If truthy, every entry is passed on its own
             through ``flatten_list``, so each entry ends up as a list.
         cols_init: Optional extra columns, placed after ``*args``.
         **kwargs: Forwarded unchanged to the parent ``__init__``.
     """
     columns = args
     if cols_init:
         # Non-recursive flatten: merge both sources while leaving any
         # nesting inside the individual entries untouched.
         columns = flatten_list([columns, cols_init], recursive=False)
     if groups:
         columns = [flatten_list([entry]) for entry in columns]
     self.col_groups_init = columns
     self.groups = groups
     super().__init__(*columns, **kwargs)
Ejemplo n.º 2
0
    def fit(self, df, y=None):
        """Fit the transformer.

        Resolves the columns to work on, delegates to ``_fit`` and then
        derives the input/output column mappings from
        ``get_column_mapping``.

        Args:
            df (pandas.DataFrame): Dataframe used to fit the transformation.
            y: Not used by this method.

        Returns:
            The fitted transformer itself.

        Raises:
            ValueError: If ``keep_original`` is set and some column name is
                both an input and an output column.
        """
        selected = fit_columns(df, self.cols_init, self.dtype,
                               self.cols_not_found_error)
        if self.exclude:
            excluded = fit_columns(df, self.exclude, None,
                                   self.cols_not_found_error)
        else:
            excluded = []
        self.cols = [name for name in selected if name not in excluded]
        self._fit(df)

        # Store the mapping between input and output columns.
        self.col_map = self.get_column_mapping()

        # get_column_mapping may have been overridden without any cols_init
        # being given, so rebuild the input columns from the mapping keys
        # (first occurrence wins, order preserved).
        unique_inputs = collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.keys()))
        self.cols = list(unique_inputs)

        forward, backward = {}, {}
        for source, target in self.col_map.items():
            add_to_map_dict(forward, source, target)
            add_to_map_dict(backward, target, source)
        self.col_map_1_n = forward
        self.col_map_1_n_inverse = backward

        unique_outputs = collections.OrderedDict.fromkeys(
            flatten_list(self.col_map.values()))
        self.cols_out = list(unique_outputs)

        # Names present both as input and as output. They have to be dropped
        # from df_in during the transform phase: df_in is joined with df_out
        # and duplicate column names are not wanted.
        self.cols_in_out = set(self.cols) & set(self.cols_out)

        if self.keep_original and self.cols_in_out:
            message = ("Rename the output columns if you want to keep "
                       "the original columns, name collisions in "
                       f"{self.cols_in_out}")
            raise ValueError(message)

        return self
Ejemplo n.º 3
0
    def test_flatten_no_recursion(self):
        """Only the outermost nesting level is flattened without recursion."""

        nested = ["c1", ["c2", {"c3"}]]
        expected = ["c1", "c2", {"c3"}]
        result = flatten_list(nested, recursive=False)
        self.assertEqual(result, expected)
Ejemplo n.º 4
0
    def test_flatten_multi_level(self):
        """Several levels of nested iterables are flattened recursively."""

        nested = ["c1", ["c2", ["c3", ("c4", [[{"c5"}], "c6"])]]]
        expected = ["c1", "c2", "c3", "c4", "c5", "c6"]
        result = flatten_list(nested)
        self.assertEqual(result, expected)
Ejemplo n.º 5
0
    def test_flatten_one_nested_level(self):
        """A single level of nested list, set and tuple is flattened."""

        nested = ["c1", ["c2"], {"c3"}, ("c4", "c5")]
        expected = ["c1", "c2", "c3", "c4", "c5"]
        result = flatten_list(nested)
        self.assertEqual(result, expected)
Ejemplo n.º 6
0
    def __init__(self, *args, cols_init=None, exclude=None, dtype=None,
                 name=None, keep_original=False, col_format="{}",
                 cols_not_found_error=False):
        """Create a new transformer.

        Column names can use Unix filename pattern matching
        (:obj:`fnmatch`).

        Args:
            *args (:obj:`list` of :obj:`str`): Columns the transformer will
                work on.
            cols_init (:obj:`list` of :obj:`str`): Columns the transformer
                will work on. When `*args` are also given, these columns are
                appended after them.
            exclude (:obj:`list` of :obj:`str`): Columns to exclude. The
                exclusion is applied after fitting the columns, so it can be
                combined with `*args` and `cols_init`.
            dtype (:obj:`numpy.dtype`, :obj:`str`, :obj:`list` of
                :obj:`numpy.dtype` or of :obj:`str`, or :obj:`dict`): Value
                passed to :obj:`pandas.DataFrame.select_dtypes`.
                A :obj:`dict` is passed with dictionary unpacking,
                `select_dtypes(**dtype)`, which makes it possible to exclude,
                for example, int dtypes with `dtype=dict(exclude=int)`.
                The columns returned by that call (run on the DataFrame given
                to the fit method) are the ones used in the transformation
                phase.
                When combined with `*args` or `cols_init`, the dtype filter
                is applied later.
            name (:obj:`str`): Human-friendly name of the transformer.
            keep_original (:obj:`bool`): `True` to keep the input columns
                used by the transformer in the transformed DataFrame,
                `False` otherwise.
                Note that, if an output column has the same name as an input
                column, the input column will not be included even if
                `keep_original` is set to `True`.
                Default: `False`.
            col_format (:obj:`str`): Format of the new column names. The
                "{}" placeholder is substituted by the column name. For
                example, to append the string "_new" to all the generated
                columns set `col_format="{}_new"`.
                Default: "{}".
            cols_not_found_error (:obj:`bool`): Raise an error if there
                isn't any match for any of the specified columns.
                Default: `False`.
        """

        # Positional columns come first, followed by cols_init when given.
        requested = (args, cols_init) if cols_init else args

        # Configuration supplied by the caller.
        self.cols_init = flatten_list(requested)
        self.exclude = flatten_list(exclude or [])
        self.dtype = dtype
        self.keep_original = keep_original
        self.name = name
        self.col_format = col_format
        self.cols_not_found_error = cols_not_found_error

        # Placeholders, all of them set in fit.
        self.col_map = None
        self.cols = None  # fitted columns
        self.cols_out = None
        self.cols_in_out = None