Exemple #1
0
 def __init__(self,
              columns,
              pattern,
              replace,
              result_columns=None,
              drop=True,
              func_desc=None,
              **kwargs):
     """Configure a regex-substitution stage over the given columns.

     Parameters
     ----------
     columns : str or list-like
         Label(s) of the column(s) to process.
     pattern
         Regular expression, compiled once here with ``re.compile``.
     replace
         Replacement handed, together with the compiled pattern, to
         ``RegexReplace.RegexReplacer``.
     result_columns : optional
         Forwarded unchanged to the parent constructor.
     drop : bool, default True
         Forwarded unchanged to the parent constructor.
     func_desc : optional
         Accepted but not used by this constructor.
     **kwargs
         Extra parent-constructor arguments; they override the defaults
         assembled here.
     """
     self._pattern = pattern
     self._replace = replace
     self._pattern_obj = re.compile(pattern)
     col_str = _list_str(columns)
     # A lone string label must not be pluralised: len() of a str counts
     # characters, not columns.
     several_columns = not isinstance(columns, str) and len(columns) > 1
     sfx = "s" if several_columns else ""
     base_str = RegexReplace._BASE_STR.format(pattern, replace, sfx,
                                              col_str)
     super_kwargs = {
         'columns':
         columns,
         'func':
         RegexReplace.RegexReplacer(self._pattern_obj, self._replace),
         'colbl_sfx':
         '_regex',
         'result_columns':
         result_columns,
         'drop':
         drop,
         'exmsg':
         base_str + ApplyByCols._DEF_EXC_MSG_SUFFIX.format(sfx, col_str),
         'appmsg':
         base_str + ApplyByCols._DEF_APP_MSG_SUFFIX,
         'desc':
         base_str + ApplyByCols._DEF_DESCRIPTION_SUFFIX,
     }
     # Caller-supplied keyword arguments win over the defaults above.
     super_kwargs.update(**kwargs)
     super().__init__(**super_kwargs)
Exemple #2
0
 def __init__(self,
              columns=None,
              exclude=None,
              drop=False,
              non_neg=False,
              const_shift=None,
              **kwargs):
     """Store the log-transform options and build the default messages.

     ``columns`` and ``exclude`` are passed through
     ``_interpret_columns_param`` when given; a missing ``columns`` is
     stored as ``None`` and a missing ``exclude`` as an empty list.
     Entries in ``kwargs`` override the default messages built here.
     """
     self._columns = (
         None if columns is None else _interpret_columns_param(columns))
     self._exclude = (
         [] if exclude is None else _interpret_columns_param(exclude))
     self._drop = drop
     self._non_neg = non_neg
     self._const_shift = const_shift
     self._col_to_minval = {}
     # An absent or empty selection is described generically.
     if self._columns:
         target_desc = _list_str(self._columns)
     else:
         target_desc = "all numeric columns"
     defaults = {
         'exmsg': Log._DEF_LOG_EXC_MSG.format(target_desc),
         'appmsg': Log._DEF_LOG_APP_MSG.format(target_desc),
         'desc': "Log-transform {}".format(target_desc),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #3
0
 def __init__(self,
              columns,
              min_len,
              max_len=None,
              result_columns=None,
              drop=True,
              **kwargs):
     """Configure a stage that filters tokens by their length.

     Parameters
     ----------
     columns : str or list-like
         Label(s) of the column(s) to process.
     min_len
         Bound given to ``MinLengthTokenFilter`` (or, together with
         ``max_len``, to ``MinMaxLengthTokenFilter``).
     max_len : optional
         When truthy, a min/max filter is used instead of the min-only
         one.
     result_columns : optional
         Forwarded to the parent constructor when provided.
     drop : bool, default True
         Forwarded unchanged to the parent constructor.
     **kwargs
         Extra parent-constructor arguments; they override the defaults
         assembled here.
     """
     self._min_len = min_len
     self._max_len = max_len
     col_str = _list_str(columns)
     # A lone string label must not be pluralised: len() of a str counts
     # characters, not columns.
     several_columns = not isinstance(columns, str) and len(columns) > 1
     sfx = "s" if several_columns else ""
     token_filter = DropTokensByLength.MinLengthTokenFilter(min_len)
     cond_str = " > {}".format(min_len)
     if max_len:
         token_filter = DropTokensByLength.MinMaxLengthTokenFilter(
             min_len=min_len, max_len=max_len)
         cond_str += " < {}".format(max_len)
     base_str = DropTokensByLength._BASE_STR.format(cond_str, sfx, col_str)
     super_kwargs = {
         "columns": columns,
         "func": token_filter,
         "colbl_sfx": "_filtered",
         "drop": drop,
         "exmsg": base_str + DropTokensByLength._DEF_EXC_MSG_SUFFIX,
         "appmsg": base_str + DropTokensByLength._DEF_APP_MSG_SUFFIX,
         "desc": base_str + DropTokensByLength._DEF_DESCRIPTION_SUFFIX,
     }
     # Previously this argument was accepted but silently discarded; only
     # forward it when given so the default call is unchanged.
     if result_columns is not None:
         super_kwargs["result_columns"] = result_columns
     super_kwargs.update(**kwargs)
     super().__init__(**super_kwargs)
Exemple #4
0
 def __init__(self, stemmer_name, columns, drop=True, min_len=None,
              max_len=None, **kwargs):
     """Build a stemming stage for the given columns.

     The stemmer is looked up by ``stemmer_name`` and wrapped in a
     ``_TokenListStemmer`` together with the optional length bounds.
     ``kwargs`` override the defaults assembled here, except that
     ``none_columns`` is always forced to ``'error'``.
     """
     self.stemmer_name = stemmer_name
     self.stemmer = SnowballStem.__safe_stemmer_by_name(stemmer_name)
     self.list_stemmer = SnowballStem._TokenListStemmer(
         stemmer=self.stemmer, min_len=min_len, max_len=max_len)
     self._columns = _interpret_columns_param(columns)
     label_text = _list_str(self._columns)
     # Assemble the length qualifier used in the description.
     qualifier_parts = []
     if min_len:
         qualifier_parts.append(f' of length >= {min_len}')
     if max_len:
         lead = '' if min_len else ' of length'
         qualifier_parts.append(f'{lead} <= {max_len}')
     qualifier = ''.join(qualifier_parts)
     defaults = {
         'columns': columns,
         'value_map': self.list_stemmer,
         'drop': drop,
         'suffix': '_stem',
         'exmsg': SnowballStem._DEF_STEM_EXC_MSG.format(label_text),
         'desc': SnowballStem._DEF_STEM_DESC.format(qualifier, label_text),
     }
     defaults.update(kwargs)
     # Applied last so that callers cannot override it.
     defaults['none_columns'] = 'error'
     super().__init__(**defaults)
Exemple #5
0
 def __init__(self,
              scaler,
              exclude_columns=None,
              exclude_object_columns=True,
              **kwargs):
     """Store the scaling configuration and initialise the parent stage.

     All ``kwargs`` are kept on ``self._kwargs``; only those whose key is
     reported by the parent's ``_init_kwargs()`` are forwarded to the
     parent constructor, where they override the default messages.
     """
     self.scaler = scaler
     if exclude_columns is not None:
         self._exclude_columns = _interpret_columns_param(exclude_columns)
         desc_suffix = " except columns {}.".format(
             _list_str(self._exclude_columns))
     else:
         self._exclude_columns = []
         desc_suffix = "."
     self._exclude_obj_cols = exclude_object_columns
     defaults = {
         "exmsg": Scale._DEF_SCALE_EXC_MSG,
         "appmsg": Scale._DEF_SCALE_APP_MSG,
         "desc": Scale._DESC_PREFIX + desc_suffix,
     }
     self._kwargs = kwargs
     accepted = super()._init_kwargs()
     # Forward only the keyword arguments the parent understands.
     defaults.update(
         {name: kwargs[name] for name in kwargs if name in accepted})
     super().__init__(**defaults)
Exemple #6
0
 def __init__(self,
              columns=None,
              dummy_na=False,
              exclude_columns=None,
              col_subset=False,
              drop_first=True,
              drop=True,
              **kwargs):
     """Record the binarization options and prepare default messages.

     ``columns`` and ``exclude_columns`` go through
     ``_interpret_columns_param`` when supplied; otherwise they are stored
     as ``None`` and ``[]`` respectively. ``kwargs`` entries override the
     default messages before everything is passed to the parent
     constructor.
     """
     self._columns = (
         None if columns is None else _interpret_columns_param(columns))
     self._dummy_na = dummy_na
     self._exclude_columns = (
         [] if exclude_columns is None
         else _interpret_columns_param(exclude_columns))
     self._col_subset = col_subset
     self._drop_first = drop_first
     self._drop = drop
     self._dummy_col_map = {}
     self._binarizer_map = {}
     label_text = _list_str(self._columns)
     # Generic wording is substituted when label_text is falsy.
     app_target = label_text or "all columns"
     desc_target = label_text or "all categorical columns"
     defaults = {
         'exmsg': Binarize._DEF_BINAR_EXC_MSG.format(label_text),
         'appmsg': Binarize._DEF_BINAR_APP_MSG.format(app_target),
         'desc': "Binarize {}".format(desc_target),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #7
0
 def __init__(self,
              columns=None,
              dummy_na=False,
              exclude_columns=None,
              col_subset=False,
              drop_first=True,
              drop=True,
              **kwargs):
     """Record the one-hot encoding options and prepare default messages.

     ``columns`` and ``exclude_columns`` go through
     ``_interpret_columns_param`` when supplied; otherwise they are stored
     as ``None`` and ``[]`` respectively. ``kwargs`` entries override the
     default messages before everything is passed to the parent
     constructor.
     """
     self._columns = (
         None if columns is None else _interpret_columns_param(columns))
     self._dummy_na = dummy_na
     self._exclude_columns = (
         [] if exclude_columns is None
         else _interpret_columns_param(exclude_columns))
     self._col_subset = col_subset
     self._drop_first = drop_first
     self._drop = drop
     self._dummy_col_map = {}
     self._encoder_map = {}
     label_text = _list_str(self._columns)
     # Generic wording is substituted when label_text is falsy.
     app_target = label_text or "all columns"
     desc_target = label_text or "all categorical columns"
     defaults = {
         "exmsg": OneHotEncode._DEF_1HENCODE_EXC_MSG.format(label_text),
         "appmsg": OneHotEncode._DEF_1HENCODE_APP_MSG.format(app_target),
         "desc": "One-hot encode {}".format(desc_target),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #8
0
 def __init__(self, conditions, reduce=None, columns=None, **kwargs):
     """Validate the row-dropping conditions and initialise the stage.

     ``conditions`` is either a dict mapping column labels to callables or
     an iterable of callables. ``reduce`` defaults to ``'any'`` and must
     be a key of ``RowDrop._REDUCERS``.

     Raises
     ------
     ValueError
         If ``reduce`` is unsupported, or if any condition is not
         callable.
     """
     self._conditions = conditions
     reduce = 'any' if reduce is None else reduce
     self._reduce = reduce
     self._columns = _interpret_columns_param(columns) if columns else None
     if reduce not in RowDrop._REDUCERS.keys():
         raise ValueError((
             "{} is an unsupported argument for the 'reduce' parameter of "
             "the RowDrop constructor!").format(reduce))
     self._cond_is_dict = isinstance(conditions, dict)
     self._columns_str = ""
     if self._cond_is_dict:
         if not all(callable(check) for check in conditions.values()):
             raise ValueError(
                 "Condition dicts given to RowDrop must map to callables!")
         # For dict conditions the keys define the affected columns.
         self._columns = list(conditions.keys())
         self._columns_str = _list_str(self._columns)
     elif not all(callable(check) for check in conditions):
         raise ValueError(
             "RowDrop condition lists can contain only callables!")
     self._row_cond = self._row_condition_builder(conditions, reduce)
     defaults = {
         'exmsg': RowDrop._DEF_ROWDROP_EXC_MSG.format(self._columns_str),
         'appmsg': RowDrop._DEF_ROWDROP_APPLY_MSG.format(self._columns_str),
         'desc': self._default_desc(),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #9
0
 def __init__(self,
              columns,
              func,
              result_columns=None,
              drop=True,
              func_desc=None,
              **kwargs):
     """Set up a stage applying ``func`` to the given columns.

     When ``result_columns`` is omitted, results reuse the source labels
     if ``drop`` is true, otherwise the source labels with an ``'_app'``
     suffix.

     Raises
     ------
     ValueError
         If ``result_columns`` is given with a different length than
         ``columns``.
     """
     self._columns = _interpret_columns_param(columns)
     self._func = func
     if result_columns is not None:
         self._result_columns = _interpret_columns_param(result_columns)
         if len(self._result_columns) != len(self._columns):
             raise ValueError("columns and result_columns parameters must"
                              " be string lists of the same length!")
     elif drop:
         self._result_columns = self._columns
     else:
         self._result_columns = [col + '_app' for col in self._columns]
     self._drop = drop
     self._func_desc = "" if func_desc is None else func_desc
     label_text = _list_str(self._columns)
     plural = 's' if len(self._columns) > 1 else ''
     prefix = ApplyByCols._BASE_STR.format(
         self._func_desc, plural, label_text)
     defaults = {
         'exmsg': prefix + ApplyByCols._DEF_EXC_MSG_SUFFIX,
         'appmsg': prefix + ApplyByCols._DEF_APP_MSG_SUFFIX,
         'desc': prefix + ApplyByCols._DEF_DESCRIPTION_SUFFIX,
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #10
0
 def __init__(self,
              columns,
              value_map,
              result_columns=None,
              drop=True,
              **kwargs):
     """Set up a stage mapping column values through ``value_map``.

     When ``result_columns`` is omitted, results reuse the source labels
     if ``drop`` is true, otherwise the source labels with a ``'_map'``
     suffix.

     Raises
     ------
     ValueError
         If ``result_columns`` is given with a different length than
         ``columns``.
     """
     self._columns = _interpret_columns_param(columns, 'columns')
     self._value_map = value_map
     if result_columns is not None:
         self._result_columns = _interpret_columns_param(
             result_columns, 'result_columns')
         if len(self._result_columns) != len(self._columns):
             raise ValueError("columns and result_columns parameters must"
                              " be string lists of the same length!")
     elif drop:
         self._result_columns = self._columns
     else:
         self._result_columns = [col + '_map' for col in self._columns]
     label_text = _list_str(self._columns)
     plural = 's' if len(self._columns) > 1 else ''
     self._drop = drop
     defaults = {
         'exmsg': MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(
             plural, label_text),
         'appmsg': MapColVals._DEF_MAP_COLVAL_APP_MSG.format(
             plural, label_text, self._value_map),
         'desc': "Map values of column{} {} with {}.".format(
             plural, label_text, self._value_map),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
 def __init__(self, values, columns=None, **kwargs):
     """Set up a stage keeping only rows holding the given values.

     ``columns`` is normalised with ``_interpret_columns_param`` when
     given and stored as ``None`` otherwise. ``kwargs`` entries override
     the default messages built here.
     """
     self._values = values
     self._values_str = _list_str(self._values)
     self._columns_str = _list_str(columns)
     if columns is None:
         self._columns = None
         kept_text = self._values_str
     else:
         self._columns = _interpret_columns_param(columns)
         # Mention the columns only when a selection was given.
         kept_text = "{} in {}".format(self._values_str, self._columns_str)
     defaults = {
         'exmsg': ValKeep._DEF_VALKEEP_EXC_MSG.format(self._columns_str),
         'appmsg': ValKeep._DEF_VALKEEP_APPLY_MSG.format(kept_text),
         'desc': self._default_desc(),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #12
0
 def __init__(self, rename_map, **kwargs):
     """Set up a column-renaming stage from a mapping of labels.

     The keys of ``rename_map`` are listed in the default exception
     message. ``kwargs`` entries override the defaults built here.
     """
     self._rename_map = rename_map
     source_labels = list(rename_map.keys())
     plural = 's' if len(rename_map) > 1 else ''
     defaults = dict(
         exmsg=ColRename._DEF_COLDRENAME_EXC_MSG.format(
             _list_str(source_labels)),
         desc=f"Rename column{plural} with {self._rename_map}",
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #13
0
 def __init__(self, values, columns=None, **kwargs):
     """Set up a stage dropping rows that hold any of ``values``.

     ``kwargs`` may override ``columns`` and ``desc_temp``;
     ``none_columns`` is always forced to ``'all'``.
     """
     self._values = values
     self._values_str = _list_str(self._values)
     # The trailing '{}' is left as a placeholder in the template.
     description_template = (
         f'Drop values {self._values_str} in columns {{}}')
     defaults = dict(columns=columns, desc_temp=description_template)
     defaults.update(kwargs)
     defaults['none_columns'] = 'all'
     super().__init__(**defaults)
Exemple #14
0
 def __init__(self, bin_map, drop=True, **kwargs):
     """Set up a binning stage from a mapping keyed by column label.

     The keys of ``bin_map`` are listed in the default exception message.
     ``kwargs`` entries override the defaults built here.
     """
     self._bin_map = bin_map
     self._drop = drop
     binned_labels = _list_str(list(bin_map.keys()))
     defaults = dict(
         exmsg=Bin._DEF_BIN_EXC_MSG.format(binned_labels),
         desc=self._default_desc(),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #15
0
 def __init__(self, columns, **kwargs):
     """Set up a column-dropping stage.

     ``columns`` may be a callable, which is stored as-is; anything else
     is normalised with ``_interpret_columns_param``. ``kwargs`` entries
     override the default messages built here.
     """
     self._columns = columns
     self._columns_str = _list_str(columns)
     if not callable(columns):
         self._columns = _interpret_columns_param(columns, 'columns')
     shown = self._columns_str
     defaults = dict(
         exmsg=ColDrop._DEF_COLDROP_EXC_MSG.format(shown),
         appmsg=ColDrop._DEF_COLDROP_APPLY_MSG.format(shown),
         desc=self._default_desc(),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
 def __init__(self, columns, threshold, drop=True, **kwargs):
     """Set up a stage dropping rare tokens from the given columns.

     ``threshold`` and ``drop`` are stored as given, and an empty
     ``_rare_removers`` dict is created. ``kwargs`` entries override the
     default messages built here.
     """
     self._columns = _interpret_columns_param(columns)
     self._threshold = threshold
     self._drop = drop
     self._rare_removers = {}
     label_text = _list_str(self._columns)
     defaults = dict(
         exmsg=DropRareTokens._DEF_RARE_EXC_MSG.format(label_text),
         appmsg="Dropping rare tokens from {}...".format(label_text),
         desc="Drop rare tokens from {}".format(label_text),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #17
0
 def __init__(self, columns, **kwargs):
     """Set up a stage that shapes dataframes to a column schema.

     ``columns`` is normalised with ``_interpret_columns_param`` and its
     string form is used in the default description and exception
     message. ``kwargs`` entries override those defaults.
     """
     self._columns = _interpret_columns_param(columns)
     self._columns_str = _list_str(self._columns)
     defaults = dict(
         exmsg=(f"Not all required columns {self._columns_str} "
                f"found in input dataframe!"),
         desc=(f"Transform input dataframes to the following schema: "
               f"{self._columns_str}"),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #18
0
 def __init__(self, columns, drop=True, **kwargs):
     """Set up a stage that untokenizes the given columns.

     ``kwargs`` override the defaults assembled here, except that
     ``none_columns`` is always forced to ``'error'``.
     """
     self._columns = _interpret_columns_param(columns)
     label_text = _list_str(self._columns)
     defaults = dict(
         columns=columns,
         value_map=UntokenizeText._untokenize_list,
         drop=drop,
         suffix='_untok',
         exmsg=UntokenizeText._DEF_UNTOKENIZE_EXC_MSG.format(label_text),
         desc=f"Untokenize {label_text}",
     )
     defaults.update(kwargs)
     # Applied last so that callers cannot override it.
     defaults['none_columns'] = 'error'
     super().__init__(**defaults)
 def __init__(self, columns, drop=True, **kwargs):
     """Set up a stage that untokenizes the given columns.

     ``kwargs`` entries override the defaults assembled here before they
     are passed to the parent constructor.
     """
     self._columns = _interpret_columns_param(columns)
     label_text = _list_str(self._columns)
     defaults = dict(
         columns=columns,
         value_map=UntokenizeWords._untokenize_list,
         drop=drop,
         suffix='_untok',
         exmsg=UntokenizeWords._DEF_UNTOKENIZE_EXC_MSG.format(label_text),
         appmsg="Untokenizing {}".format(label_text),
         desc="Untokenize {}".format(label_text),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #20
0
 def __init__(self, bin_map, drop=True, **kwargs):
     """Set up a binning stage from a mapping keyed by column label.

     The keys of ``bin_map`` are listed in the default messages, with a
     plural suffix when there is more than one entry. ``kwargs`` entries
     override the defaults built here.
     """
     self._bin_map = bin_map
     self._drop = drop
     binned_labels = _list_str(list(bin_map.keys()))
     plural = 's' if len(bin_map) > 1 else ''
     defaults = dict(
         exmsg=Bin._DEF_BIN_EXC_MSG.format(binned_labels),
         appmsg=Bin._DEF_BIN_APP_MSG.format(plural, binned_labels),
         desc=self._default_desc(),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
 def __init__(self, columns, drop=True, **kwargs):
     """Set up a stage tokenizing the given columns.

     ``__check_punkt`` is called before anything else, and
     ``nltk.word_tokenize`` is used as the value map. ``kwargs`` entries
     override the defaults assembled here.
     """
     self.__check_punkt()
     self._columns = _interpret_columns_param(columns)
     label_text = _list_str(self._columns)
     defaults = dict(
         columns=columns,
         value_map=nltk.word_tokenize,
         drop=drop,
         suffix='_tok',
         exmsg=TokenizeWords._DEF_TOKENIZE_EXC_MSG.format(label_text),
         appmsg=TokenizeWords._DEF_TOKENIZE_APP_MSG.format(label_text),
         desc="Tokenize {}".format(label_text),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
 def __init__(self, stemmer_name, columns, drop=True, **kwargs):
     """Build a stemming stage for the given columns.

     The stemmer is looked up by ``stemmer_name`` and wrapped in a
     ``_TokenListStemmer``. ``kwargs`` entries override the defaults
     assembled here.
     """
     self.stemmer_name = stemmer_name
     self.stemmer = SnowballStem.__safe_stemmer_by_name(stemmer_name)
     self.list_stemmer = SnowballStem._TokenListStemmer(self.stemmer)
     self._columns = _interpret_columns_param(columns)
     label_text = _list_str(self._columns)
     defaults = dict(
         columns=columns,
         value_map=self.list_stemmer,
         drop=drop,
         suffix='_stem',
         exmsg=SnowballStem._DEF_STEM_EXC_MSG.format(label_text),
         appmsg=SnowballStem._DEF_STEM_APP_MSG.format(label_text),
         desc="Stem tokens in {}".format(label_text),
     )
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #23
0
 def __init__(
     self,
     columns,
     value_map,
     result_columns=None,
     drop=True,
     suffix=None,
     **kwargs
 ):
     """Set up a stage mapping column values through ``value_map``.

     ``suffix`` defaults to ``"_map"``. When ``result_columns`` is
     omitted, results reuse the source labels if ``drop`` is true,
     otherwise the source labels with the suffix appended.

     Raises
     ------
     ValueError
         If ``result_columns`` is given with a different length than
         ``columns``.
     """
     self._columns = _interpret_columns_param(columns)
     self._value_map = value_map
     self.suffix = "_map" if suffix is None else suffix
     if result_columns is not None:
         self._result_columns = _interpret_columns_param(result_columns)
         if len(self._result_columns) != len(self._columns):
             raise ValueError(
                 "columns and result_columns parameters must"
                 " be string lists of the same length!"
             )
     elif drop:
         self._result_columns = self._columns
     else:
         self._result_columns = [
             label + self.suffix for label in self._columns
         ]
     label_text = _list_str(self._columns)
     plural = "s" if len(self._columns) > 1 else ""
     self._drop = drop
     message_args = (plural, label_text, self._value_map)
     defaults = {
         "exmsg": MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(
             plural, label_text
         ),
         "appmsg": MapColVals._DEF_MAP_COLVAL_APP_MSG.format(*message_args),
         "desc": "Map values of column{} {} with {}.".format(*message_args),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #24
0
 def __init__(self, columns=None, exclude_columns=None, drop=True,
              **kwargs):
     """Record the encoding options and prepare default messages.

     ``columns`` and ``exclude_columns`` go through
     ``_interpret_columns_param`` when supplied; otherwise they are stored
     as ``None`` and ``[]`` respectively. An empty ``encoders`` dict is
     created. ``kwargs`` entries override the default messages.
     """
     self._columns = (
         None if columns is None else _interpret_columns_param(columns))
     self._exclude_columns = (
         [] if exclude_columns is None
         else _interpret_columns_param(exclude_columns))
     self._drop = drop
     self.encoders = {}
     label_text = _list_str(self._columns)
     # Only the description falls back to generic wording.
     desc_target = label_text or "all categorical columns"
     defaults = {
         'exmsg': Encode._DEF_ENCODE_EXC_MSG.format(label_text),
         'appmsg': Encode._DEF_ENCODE_APP_MSG.format(label_text),
         'desc': "Encode {}".format(desc_target),
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
Exemple #25
0
 def __init__(
     self,
     columns,
     func,
     result_columns=None,
     drop=True,
     func_desc=None,
     suffix=None,
     **kwargs
 ):
     """Set up a stage applying ``func`` to the given columns.

     ``suffix`` defaults to ``AggByCols._DEF_COLNAME_SUFFIX``. When
     ``result_columns`` is omitted, results reuse the source labels if
     ``drop`` is true, otherwise the source labels with the suffix
     appended.

     Raises
     ------
     ValueError
         If ``result_columns`` is given with a different length than
         ``columns``.
     """
     suffix = AggByCols._DEF_COLNAME_SUFFIX if suffix is None else suffix
     self._suffix = suffix
     self._columns = _interpret_columns_param(columns)
     self._func = func
     if result_columns is not None:
         self._result_columns = _interpret_columns_param(result_columns)
         if len(self._result_columns) != len(self._columns):
             raise ValueError(
                 "columns and result_columns parameters must"
                 " be string lists of the same length!"
             )
     elif drop:
         self._result_columns = self._columns
     else:
         self._result_columns = [label + suffix for label in self._columns]
     self._drop = drop
     self._func_desc = "" if func_desc is None else func_desc
     label_text = _list_str(self._columns)
     plural = "s" if len(self._columns) > 1 else ""
     prefix = ApplyByCols._BASE_STR.format(
         self._func_desc, plural, label_text
     )
     defaults = {
         "exmsg": prefix + ApplyByCols._DEF_EXC_MSG_SUFFIX,
         "appmsg": prefix + ApplyByCols._DEF_APP_MSG_SUFFIX,
         "desc": prefix + ApplyByCols._DEF_DESCRIPTION_SUFFIX,
     }
     defaults.update(kwargs)
     super().__init__(**defaults)
 def __init__(self, language, columns, drop=True, **kwargs):
     """Set up a stage removing stopwords from the given columns.

     Parameters
     ----------
     language : str or iterable
         A string is resolved to a stopword list through
         ``__stopwords_by_language``; any other iterable is copied into
         a list and used as the stopwords themselves.
     columns : str or list-like
         Label(s) of the column(s) to process.
     drop : bool, default True
         Forwarded unchanged to the parent constructor.
     **kwargs
         Extra parent-constructor arguments; they override the defaults
         assembled here.

     Raises
     ------
     TypeError
         If ``language`` is neither a string nor an iterable.
     """
     # ``collections.Iterable`` was removed in Python 3.10; the ABC lives
     # in ``collections.abc``. Imported locally so the block does not
     # depend on a change to the module's import section.
     from collections.abc import Iterable

     self._language = language
     if isinstance(language, str):
         self._stopwords_list = RemoveStopwords.__stopwords_by_language(
             language)
     elif isinstance(language, Iterable):
         self._stopwords_list = list(language)
     else:
         raise TypeError("language parameter should be string or list!")
     self._stopwords_remover = RemoveStopwords._StopwordsRemover(
         self._stopwords_list)
     self._columns = _interpret_columns_param(columns)
     col_str = _list_str(self._columns)
     super_kwargs = {
         'columns': columns,
         'value_map': self._stopwords_remover,
         'drop': drop,
         'suffix': '_nostop',
         'exmsg': RemoveStopwords._DEF_STOPWORDS_EXC_MSG.format(col_str),
         'appmsg': RemoveStopwords._DEF_STOPWORDS_APP_MSG.format(col_str),
         'desc': "Removing stopwords from {}".format(col_str),
     }
     # Caller-supplied keyword arguments win over the defaults above.
     super_kwargs.update(**kwargs)
     super().__init__(**super_kwargs)
Exemple #27
0
 def __init__(self,
              columns,
              bad_tokens,
              result_columns=None,
              drop=True,
              **kwargs):
     """Configure a stage that drops listed tokens from token lists.

     Parameters
     ----------
     columns : str or list-like
         Label(s) of the column(s) to process.
     bad_tokens : list of str
         Tokens handed to ``ListTokenFilter``. They are spelled out in
         the default messages only when there are fewer than ten.
     result_columns : optional
         Forwarded to the parent constructor when provided.
     drop : bool, default True
         Forwarded unchanged to the parent constructor.
     **kwargs
         Extra parent-constructor arguments; they override the defaults
         assembled here.
     """
     self._bad_tokens = bad_tokens
     col_str = _list_str(columns)
     # A lone string label must not be pluralised: len() of a str counts
     # characters, not columns.
     several_columns = not isinstance(columns, str) and len(columns) > 1
     sfx = "s" if several_columns else ""
     cond_str = ""
     if len(bad_tokens) < 10:
         cond_str = "in list [" + " ".join(bad_tokens) + "]"
     base_str = DropTokensByList._BASE_STR.format(cond_str, sfx, col_str)
     super_kwargs = {
         "columns": columns,
         "func": DropTokensByList.ListTokenFilter(bad_tokens),
         "colbl_sfx": "_filtered",
         "drop": drop,
         "exmsg": base_str + DropTokensByList._DEF_EXC_MSG_SUFFIX,
         "appmsg": base_str + DropTokensByList._DEF_APP_MSG_SUFFIX,
         "desc": base_str + DropTokensByList._DEF_DESCRIPTION_SUFFIX,
     }
     # Previously this argument was accepted but silently discarded; only
     # forward it when given so the default call is unchanged.
     if result_columns is not None:
         super_kwargs["result_columns"] = result_columns
     super_kwargs.update(**kwargs)
     super().__init__(**super_kwargs)
Exemple #28
0
def test_list_str():
    """Check ``_list_str`` on None, list, string, tuple and scalar input."""
    # None must come back as the None singleton itself.
    assert _list_str(None) is None
    expectations = [
        (['a', 'b'], 'a, b'),
        ('a', 'a'),
        ((1, 2), '1, 2'),
        (5, 5),
    ]
    for given, wanted in expectations:
        assert _list_str(given) == wanted