def __init__(self, columns, pattern, replace, result_columns=None,
             drop=True, func_desc=None, **kwargs):
    """Set up a regex-replacement stage over the given columns.

    Parameters
    ----------
    columns : str or sequence
        Column label(s); forwarded to the parent constructor.
    pattern : str
        Regular expression; compiled once here with ``re.compile``.
    replace : object
        Replacement handed to ``RegexReplace.RegexReplacer``.
    result_columns, drop
        Forwarded unchanged to the parent constructor.
    func_desc
        Accepted for signature compatibility; not used by this method.
    **kwargs
        Override any of the default keyword arguments built below.

    Raises
    ------
    re.error
        If ``pattern`` is not a valid regular expression.
    """
    self._pattern = pattern
    self._replace = replace
    self._pattern_obj = re.compile(pattern)
    col_str = _list_str(columns)
    # A single column given as a plain string must not be pluralised by
    # the number of characters in its name.
    is_plural = not isinstance(columns, str) and len(columns) > 1
    sfx = "s" if is_plural else ""
    base_str = RegexReplace._BASE_STR.format(pattern, replace, sfx, col_str)
    super_kwargs = {
        'columns': columns,
        'func': RegexReplace.RegexReplacer(self._pattern_obj, self._replace),
        'colbl_sfx': '_regex',
        'result_columns': result_columns,
        'drop': drop,
        'exmsg': base_str + ApplyByCols._DEF_EXC_MSG_SUFFIX.format(
            sfx, col_str),
        'appmsg': base_str + ApplyByCols._DEF_APP_MSG_SUFFIX,
        'desc': base_str + ApplyByCols._DEF_DESCRIPTION_SUFFIX,
    }
    # Caller-supplied keyword arguments win over the defaults above.
    super_kwargs.update(**kwargs)
    super().__init__(**super_kwargs)
def __init__(self, columns=None, exclude=None, drop=False, non_neg=False,
             const_shift=None, **kwargs):
    """Store the log-transform settings and build the default messages.

    ``columns`` and ``exclude`` are normalised with
    ``_interpret_columns_param`` when given; otherwise they default to
    ``None`` and an empty list respectively. Any ``kwargs`` override the
    default ``exmsg``/``appmsg``/``desc`` before they reach the parent
    constructor.
    """
    self._columns = (
        None if columns is None else _interpret_columns_param(columns)
    )
    self._exclude = (
        [] if exclude is None else _interpret_columns_param(exclude)
    )
    self._drop = drop
    self._non_neg = non_neg
    self._const_shift = const_shift
    self._col_to_minval = {}

    # Without explicit columns the messages speak about every numeric one.
    if self._columns:
        target = _list_str(self._columns)
    else:
        target = "all numeric columns"

    parent_args = {
        'exmsg': Log._DEF_LOG_EXC_MSG.format(target),
        'appmsg': Log._DEF_LOG_APP_MSG.format(target),
        'desc': "Log-transform {}".format(target),
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, min_len, max_len=None, result_columns=None,
             drop=True, **kwargs):
    """Set up a stage that filters tokens by their length.

    Parameters
    ----------
    columns : str or sequence
        Column label(s); forwarded to the parent constructor.
    min_len
        Lower bound handed to the token filter.
    max_len : optional
        When truthy, a ``MinMaxLengthTokenFilter`` is used instead of the
        ``MinLengthTokenFilter``.
    result_columns : optional
        Forwarded to the parent constructor (previously this argument was
        accepted but silently ignored).
    drop
        Forwarded to the parent constructor.
    **kwargs
        Override any of the default keyword arguments built below.
    """
    self._min_len = min_len
    self._max_len = max_len
    col_str = _list_str(columns)
    # A single column given as a plain string must not be pluralised by
    # the number of characters in its name.
    is_plural = not isinstance(columns, str) and len(columns) > 1
    sfx = "s" if is_plural else ""
    token_filter = DropTokensByLength.MinLengthTokenFilter(min_len)
    cond_str = " > {}".format(min_len)
    if max_len:
        token_filter = DropTokensByLength.MinMaxLengthTokenFilter(
            min_len=min_len, max_len=max_len)
        cond_str += " < {}".format(max_len)
    base_str = DropTokensByLength._BASE_STR.format(cond_str, sfx, col_str)
    super_kwargs = {
        "columns": columns,
        "func": token_filter,
        "colbl_sfx": "_filtered",
        # Pass the caller's choice through instead of dropping it.
        "result_columns": result_columns,
        "drop": drop,
        "exmsg": base_str + DropTokensByLength._DEF_EXC_MSG_SUFFIX,
        "appmsg": base_str + DropTokensByLength._DEF_APP_MSG_SUFFIX,
        "desc": base_str + DropTokensByLength._DEF_DESCRIPTION_SUFFIX,
    }
    # Caller-supplied keyword arguments win over the defaults above.
    super_kwargs.update(**kwargs)
    super().__init__(**super_kwargs)
def __init__(self, stemmer_name, columns, drop=True, min_len=None,
             max_len=None, **kwargs):
    """Create the stemmer objects and the default stage description.

    The description mentions the length bounds only for those of
    ``min_len``/``max_len`` that are truthy. ``none_columns`` is always
    forced to ``'error'``, even if the caller passes it in ``kwargs``.
    """
    self.stemmer_name = stemmer_name
    self.stemmer = SnowballStem.__safe_stemmer_by_name(stemmer_name)
    self.list_stemmer = SnowballStem._TokenListStemmer(
        stemmer=self.stemmer,
        min_len=min_len,
        max_len=max_len,
    )
    self._columns = _interpret_columns_param(columns)
    col_str = _list_str(self._columns)

    # Collect whichever bounds are active, then prefix them once.
    bounds = []
    if min_len:
        bounds.append(f' >= {min_len}')
    if max_len:
        bounds.append(f' <= {max_len}')
    cond_str = (' of length' + ''.join(bounds)) if bounds else ''

    parent_args = {
        'columns': columns,
        'value_map': self.list_stemmer,
        'drop': drop,
        'suffix': '_stem',
        'exmsg': SnowballStem._DEF_STEM_EXC_MSG.format(col_str),
        'desc': SnowballStem._DEF_STEM_DESC.format(cond_str, col_str),
    }
    parent_args.update(**kwargs)
    # Set after the update so callers cannot override it.
    parent_args['none_columns'] = 'error'
    super().__init__(**parent_args)
def __init__(self, scaler, exclude_columns=None,
             exclude_object_columns=True, **kwargs):
    """Record the scaler configuration and initialise the parent stage.

    All ``kwargs`` are kept on ``self._kwargs``; only those whose key is
    reported by the parent's ``_init_kwargs()`` are passed on to the
    parent constructor, where they override the default messages.
    """
    self.scaler = scaler
    self._exclude_obj_cols = exclude_object_columns

    if exclude_columns is not None:
        self._exclude_columns = _interpret_columns_param(exclude_columns)
        desc_suffix = " except columns {}.".format(
            _list_str(self._exclude_columns))
    else:
        self._exclude_columns = []
        desc_suffix = "."

    parent_args = {
        "exmsg": Scale._DEF_SCALE_EXC_MSG,
        "appmsg": Scale._DEF_SCALE_APP_MSG,
        "desc": Scale._DESC_PREFIX + desc_suffix,
    }
    self._kwargs = kwargs
    accepted = super()._init_kwargs()
    parent_args.update(
        {key: value for key, value in kwargs.items() if key in accepted}
    )
    super().__init__(**parent_args)
def __init__(self, columns=None, dummy_na=False, exclude_columns=None,
             col_subset=False, drop_first=True, drop=True, **kwargs):
    """Store the binarization options and build the default messages.

    ``columns`` and ``exclude_columns`` are normalised with
    ``_interpret_columns_param`` when given. Entries in ``kwargs``
    override the default ``exmsg``/``appmsg``/``desc``.
    """
    self._columns = (
        None if columns is None else _interpret_columns_param(columns)
    )
    self._dummy_na = dummy_na
    self._exclude_columns = (
        []
        if exclude_columns is None
        else _interpret_columns_param(exclude_columns)
    )
    self._col_subset = col_subset
    self._drop_first = drop_first
    self._drop = drop
    self._dummy_col_map = {}
    self._binarizer_map = {}

    col_str = _list_str(self._columns)
    # The application message and description fall back to generic
    # wording when no explicit column string is available.
    app_target = col_str or "all columns"
    desc_target = col_str or "all categorical columns"
    parent_args = {
        'exmsg': Binarize._DEF_BINAR_EXC_MSG.format(col_str),
        'appmsg': Binarize._DEF_BINAR_APP_MSG.format(app_target),
        'desc': "Binarize {}".format(desc_target),
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns=None, dummy_na=False, exclude_columns=None,
             col_subset=False, drop_first=True, drop=True, **kwargs):
    """Store the one-hot encoding options and build the default messages.

    Column arguments are normalised with ``_interpret_columns_param``
    when they are not ``None``. ``kwargs`` take precedence over the
    default messages passed to the parent constructor.
    """
    if columns is not None:
        columns = _interpret_columns_param(columns)
    self._columns = columns
    self._dummy_na = dummy_na

    if exclude_columns is not None:
        excluded = _interpret_columns_param(exclude_columns)
    else:
        excluded = []
    self._exclude_columns = excluded

    self._col_subset = col_subset
    self._drop_first = drop_first
    self._drop = drop
    self._dummy_col_map = {}
    self._encoder_map = {}

    col_str = _list_str(self._columns)
    exc_msg = OneHotEncode._DEF_1HENCODE_EXC_MSG.format(col_str)
    app_msg = OneHotEncode._DEF_1HENCODE_APP_MSG.format(
        col_str or "all columns")
    description = "One-hot encode {}".format(
        col_str or "all categorical columns")
    parent_args = {
        "exmsg": exc_msg,
        "appmsg": app_msg,
        "desc": description,
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, conditions, reduce=None, columns=None, **kwargs):
    """Validate the row-drop conditions and build the row predicate.

    ``conditions`` is either a dict mapping column labels to callables
    or an iterable of callables. ``reduce`` defaults to ``'any'`` and
    must be a key of ``RowDrop._REDUCERS``.

    Raises
    ------
    ValueError
        If ``reduce`` is unsupported, or any condition is not callable.
    """
    self._conditions = conditions
    reduce = 'any' if reduce is None else reduce
    self._reduce = reduce
    self._columns = _interpret_columns_param(columns) if columns else None

    if reduce not in RowDrop._REDUCERS:
        raise ValueError((
            "{} is an unsupported argument for the 'reduce' parameter of "
            "the RowDrop constructor!").format(reduce))

    self._cond_is_dict = isinstance(conditions, dict)
    self._columns_str = ""
    if self._cond_is_dict:
        if not all(map(callable, conditions.values())):
            raise ValueError(
                "Condition dicts given to RowDrop must map to callables!")
        # For dict conditions the keys define the columns, replacing any
        # explicit `columns` argument.
        self._columns = list(conditions.keys())
        self._columns_str = _list_str(self._columns)
    elif not all(map(callable, conditions)):
        raise ValueError(
            "RowDrop condition lists can contain only callables!")

    self._row_cond = self._row_condition_builder(conditions, reduce)
    parent_args = {
        'exmsg': RowDrop._DEF_ROWDROP_EXC_MSG.format(self._columns_str),
        'appmsg': RowDrop._DEF_ROWDROP_APPLY_MSG.format(self._columns_str),
        'desc': self._default_desc(),
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, func, result_columns=None, drop=True,
             func_desc=None, **kwargs):
    """Store the function to apply and resolve the result column labels.

    When ``result_columns`` is omitted, results reuse the source labels
    if ``drop`` is truthy and otherwise get an ``'_app'`` suffix.

    Raises
    ------
    ValueError
        If explicit ``result_columns`` differ in length from ``columns``.
    """
    self._columns = _interpret_columns_param(columns)
    self._func = func

    if result_columns is not None:
        self._result_columns = _interpret_columns_param(result_columns)
        if len(self._result_columns) != len(self._columns):
            raise ValueError("columns and result_columns parameters must"
                             " be string lists of the same length!")
    elif drop:
        self._result_columns = self._columns
    else:
        self._result_columns = [col + '_app' for col in self._columns]

    self._drop = drop
    self._func_desc = "" if func_desc is None else func_desc

    plural = 's' if len(self._columns) > 1 else ''
    prefix = ApplyByCols._BASE_STR.format(
        self._func_desc, plural, _list_str(self._columns))
    parent_args = {
        'exmsg': prefix + ApplyByCols._DEF_EXC_MSG_SUFFIX,
        'appmsg': prefix + ApplyByCols._DEF_APP_MSG_SUFFIX,
        'desc': prefix + ApplyByCols._DEF_DESCRIPTION_SUFFIX,
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, value_map, result_columns=None, drop=True,
             **kwargs):
    """Store the value map and resolve the result column labels.

    When ``result_columns`` is omitted, results reuse the source labels
    if ``drop`` is truthy and otherwise get a ``'_map'`` suffix.

    Raises
    ------
    ValueError
        If explicit ``result_columns`` differ in length from ``columns``.
    """
    self._columns = _interpret_columns_param(columns, 'columns')
    self._value_map = value_map

    if result_columns is not None:
        self._result_columns = _interpret_columns_param(
            result_columns, 'result_columns')
        if len(self._result_columns) != len(self._columns):
            raise ValueError("columns and result_columns parameters must"
                             " be string lists of the same length!")
    elif drop:
        self._result_columns = self._columns
    else:
        self._result_columns = [col + '_map' for col in self._columns]

    col_str = _list_str(self._columns)
    plural = 's' if len(self._columns) > 1 else ''
    self._drop = drop
    parent_args = {
        'exmsg': MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(plural, col_str),
        'appmsg': MapColVals._DEF_MAP_COLVAL_APP_MSG.format(
            plural, col_str, self._value_map),
        'desc': "Map values of column{} {} with {}.".format(
            plural, col_str, self._value_map),
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, values, columns=None, **kwargs):
    """Store the values to keep and build the default messages.

    The application message names only the values when ``columns`` is
    ``None``, and "<values> in <columns>" otherwise. ``kwargs`` override
    the defaults passed to the parent constructor.
    """
    self._values = values
    self._values_str = _list_str(self._values)
    # Derived from the raw argument, before any normalisation.
    self._columns_str = _list_str(columns)

    if columns is not None:
        self._columns = _interpret_columns_param(columns)
        subject = "{} in {}".format(self._values_str, self._columns_str)
    else:
        self._columns = None
        subject = self._values_str
    apply_msg = ValKeep._DEF_VALKEEP_APPLY_MSG.format(subject)

    parent_args = {
        'exmsg': ValKeep._DEF_VALKEEP_EXC_MSG.format(self._columns_str),
        'appmsg': apply_msg,
        'desc': self._default_desc(),
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, rename_map, **kwargs):
    """Store the rename mapping and build the default messages.

    The exception message lists the keys of ``rename_map``; the
    description pluralises "column" when the map has more than one entry.
    """
    self._rename_map = rename_map
    source_labels = list(rename_map.keys())
    columns_str = _list_str(source_labels)
    suffix = '' if len(rename_map) <= 1 else 's'
    parent_args = dict(
        exmsg=ColRename._DEF_COLDRENAME_EXC_MSG.format(columns_str),
        desc=f"Rename column{suffix} with {self._rename_map}",
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, values, columns=None, **kwargs):
    """Store the values to drop and delegate setup to the parent stage.

    ``none_columns`` is always forced to ``'all'``, even if the caller
    supplies a different value through ``kwargs``.
    """
    self._values = values
    self._values_str = _list_str(self._values)
    # The doubled braces leave a literal "{}" in the template string.
    desc_template = f'Drop values {self._values_str} in columns {{}}'
    parent_args = dict(columns=columns, desc_temp=desc_template)
    parent_args.update(**kwargs)
    parent_args['none_columns'] = 'all'
    super().__init__(**parent_args)
def __init__(self, bin_map, drop=True, **kwargs):
    """Store the bin map and build the default exception message.

    The exception message lists the keys of ``bin_map``; the description
    comes from ``self._default_desc()``. ``kwargs`` override both.
    """
    self._bin_map = bin_map
    self._drop = drop
    binned_labels = list(bin_map.keys())
    parent_args = dict(
        exmsg=Bin._DEF_BIN_EXC_MSG.format(_list_str(binned_labels)),
        desc=self._default_desc(),
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, **kwargs):
    """Store the columns to drop and build the default messages.

    ``columns`` may be a callable, in which case it is stored as-is;
    anything else is normalised with ``_interpret_columns_param``. The
    column string used in messages is computed from the raw argument.
    """
    self._columns_str = _list_str(columns)
    if callable(columns):
        self._columns = columns
    else:
        self._columns = _interpret_columns_param(columns, 'columns')

    exc_msg = ColDrop._DEF_COLDROP_EXC_MSG.format(self._columns_str)
    app_msg = ColDrop._DEF_COLDROP_APPLY_MSG.format(self._columns_str)
    parent_args = {
        'exmsg': exc_msg,
        'appmsg': app_msg,
        'desc': self._default_desc(),
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, threshold, drop=True, **kwargs):
    """Store the rare-token settings and build the default messages.

    ``columns`` is normalised with ``_interpret_columns_param``;
    ``kwargs`` override the default ``exmsg``/``appmsg``/``desc``.
    """
    self._columns = _interpret_columns_param(columns)
    self._threshold = threshold
    self._drop = drop
    self._rare_removers = {}

    target = _list_str(self._columns)
    parent_args = dict(
        exmsg=DropRareTokens._DEF_RARE_EXC_MSG.format(target),
        appmsg="Dropping rare tokens from {}...".format(target),
        desc="Drop rare tokens from {}".format(target),
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, **kwargs):
    """Store the target schema columns and build the default messages.

    ``columns`` is normalised with ``_interpret_columns_param``;
    ``kwargs`` override the default ``exmsg`` and ``desc``.
    """
    self._columns = _interpret_columns_param(columns)
    self._columns_str = _list_str(self._columns)
    parent_args = {
        'exmsg': (f"Not all required columns {self._columns_str} "
                  f"found in input dataframe!"),
        'desc': (f"Transform input dataframes to the following schema: "
                 f"{self._columns_str}"),
    }
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, drop=True, **kwargs):
    """Configure an untokenizing value-map stage over ``columns``.

    Uses ``UntokenizeText._untokenize_list`` as the value map and the
    ``'_untok'`` suffix. ``none_columns`` is always forced to
    ``'error'``, regardless of ``kwargs``.
    """
    self._columns = _interpret_columns_param(columns)
    target = _list_str(self._columns)
    parent_args = dict(
        columns=columns,
        value_map=UntokenizeText._untokenize_list,
        drop=drop,
        suffix='_untok',
        exmsg=UntokenizeText._DEF_UNTOKENIZE_EXC_MSG.format(target),
        desc=f"Untokenize {target}",
    )
    parent_args.update(**kwargs)
    # Set after the update so callers cannot override it.
    parent_args['none_columns'] = 'error'
    super().__init__(**parent_args)
def __init__(self, columns, drop=True, **kwargs):
    """Configure an untokenizing value-map stage over ``columns``.

    Uses ``UntokenizeWords._untokenize_list`` as the value map and the
    ``'_untok'`` suffix; ``kwargs`` override any default built here.
    """
    self._columns = _interpret_columns_param(columns)
    target = _list_str(self._columns)
    parent_args = dict(
        columns=columns,
        value_map=UntokenizeWords._untokenize_list,
        drop=drop,
        suffix='_untok',
        exmsg=UntokenizeWords._DEF_UNTOKENIZE_EXC_MSG.format(target),
        appmsg="Untokenizing {}".format(target),
        desc="Untokenize {}".format(target),
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, bin_map, drop=True, **kwargs):
    """Store the bin map and build the default messages.

    Messages list the keys of ``bin_map``; the application message is
    pluralised when the map has more than one entry. ``kwargs`` override
    the defaults.
    """
    self._bin_map = bin_map
    self._drop = drop
    columns_str = _list_str(list(bin_map.keys()))
    plural = '' if len(bin_map) <= 1 else 's'
    parent_args = dict(
        exmsg=Bin._DEF_BIN_EXC_MSG.format(columns_str),
        appmsg=Bin._DEF_BIN_APP_MSG.format(plural, columns_str),
        desc=self._default_desc(),
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, drop=True, **kwargs):
    """Configure a word-tokenizing value-map stage over ``columns``.

    Calls the private ``__check_punkt`` helper first, then uses
    ``nltk.word_tokenize`` as the value map with the ``'_tok'`` suffix.
    ``kwargs`` override any default built here.
    """
    self.__check_punkt()
    self._columns = _interpret_columns_param(columns)
    target = _list_str(self._columns)
    parent_args = dict(
        columns=columns,
        value_map=nltk.word_tokenize,
        drop=drop,
        suffix='_tok',
        exmsg=TokenizeWords._DEF_TOKENIZE_EXC_MSG.format(target),
        appmsg=TokenizeWords._DEF_TOKENIZE_APP_MSG.format(target),
        desc="Tokenize {}".format(target),
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, stemmer_name, columns, drop=True, **kwargs):
    """Create the stemmer objects and configure the value-map stage.

    The token-list stemmer built here is used as the value map, with the
    ``'_stem'`` suffix. ``kwargs`` override any default built here.
    """
    self.stemmer_name = stemmer_name
    self.stemmer = SnowballStem.__safe_stemmer_by_name(stemmer_name)
    self.list_stemmer = SnowballStem._TokenListStemmer(self.stemmer)
    self._columns = _interpret_columns_param(columns)
    target = _list_str(self._columns)
    parent_args = dict(
        columns=columns,
        value_map=self.list_stemmer,
        drop=drop,
        suffix='_stem',
        exmsg=SnowballStem._DEF_STEM_EXC_MSG.format(target),
        appmsg=SnowballStem._DEF_STEM_APP_MSG.format(target),
        desc="Stem tokens in {}".format(target),
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns, value_map, result_columns=None, drop=True,
             suffix=None, **kwargs):
    """Store the value map and resolve the result column labels.

    ``suffix`` defaults to ``"_map"``. When ``result_columns`` is
    omitted, results reuse the source labels if ``drop`` is truthy and
    otherwise get ``suffix`` appended.

    Raises
    ------
    ValueError
        If explicit ``result_columns`` differ in length from ``columns``.
    """
    self._columns = _interpret_columns_param(columns)
    self._value_map = value_map
    self.suffix = "_map" if suffix is None else suffix

    if result_columns is not None:
        self._result_columns = _interpret_columns_param(result_columns)
        if len(self._result_columns) != len(self._columns):
            raise ValueError(
                "columns and result_columns parameters must"
                " be string lists of the same length!"
            )
    elif drop:
        self._result_columns = self._columns
    else:
        self._result_columns = [
            label + self.suffix for label in self._columns
        ]

    col_str = _list_str(self._columns)
    plural = "s" if len(self._columns) > 1 else ""
    self._drop = drop
    parent_args = dict(
        exmsg=MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(plural, col_str),
        appmsg=MapColVals._DEF_MAP_COLVAL_APP_MSG.format(
            plural, col_str, self._value_map
        ),
        desc="Map values of column{} {} with {}.".format(
            plural, col_str, self._value_map
        ),
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, columns=None, exclude_columns=None, drop=True, **kwargs):
    """Store the encoding options and build the default messages.

    Parameters
    ----------
    columns : optional
        Column label(s) to encode; normalised with
        ``_interpret_columns_param`` when given, otherwise kept as
        ``None``.
    exclude_columns : optional
        Column label(s) to exclude; defaults to an empty list.
    drop
        Stored on ``self._drop``.
    **kwargs
        Override the default ``exmsg``/``appmsg``/``desc``.
    """
    if columns is None:
        self._columns = None
    else:
        self._columns = _interpret_columns_param(columns)
    if exclude_columns is None:
        self._exclude_columns = []
    else:
        self._exclude_columns = _interpret_columns_param(exclude_columns)
    self._drop = drop
    self.encoders = {}
    col_str = _list_str(self._columns)
    # With no explicit columns the column string is falsy; fall back to
    # the same wording as the description so the application message
    # does not read "None".
    target = col_str or "all categorical columns"
    super_kwargs = {
        'exmsg': Encode._DEF_ENCODE_EXC_MSG.format(col_str),
        'appmsg': Encode._DEF_ENCODE_APP_MSG.format(target),
        'desc': "Encode {}".format(target),
    }
    # Caller-supplied keyword arguments win over the defaults above.
    super_kwargs.update(**kwargs)
    super().__init__(**super_kwargs)
def __init__(self, columns, func, result_columns=None, drop=True,
             func_desc=None, suffix=None, **kwargs):
    """Store the aggregation function and resolve result column labels.

    ``suffix`` defaults to ``AggByCols._DEF_COLNAME_SUFFIX``. When
    ``result_columns`` is omitted, results reuse the source labels if
    ``drop`` is truthy and otherwise get ``suffix`` appended.

    Raises
    ------
    ValueError
        If explicit ``result_columns`` differ in length from ``columns``.
    """
    suffix = AggByCols._DEF_COLNAME_SUFFIX if suffix is None else suffix
    self._suffix = suffix
    self._columns = _interpret_columns_param(columns)
    self._func = func

    if result_columns is not None:
        self._result_columns = _interpret_columns_param(result_columns)
        if len(self._result_columns) != len(self._columns):
            raise ValueError(
                "columns and result_columns parameters must"
                " be string lists of the same length!"
            )
    elif drop:
        self._result_columns = self._columns
    else:
        self._result_columns = [label + suffix for label in self._columns]

    self._drop = drop
    self._func_desc = "" if func_desc is None else func_desc

    plural = "s" if len(self._columns) > 1 else ""
    # The message templates are shared with ApplyByCols.
    prefix = ApplyByCols._BASE_STR.format(
        self._func_desc, plural, _list_str(self._columns))
    parent_args = dict(
        exmsg=prefix + ApplyByCols._DEF_EXC_MSG_SUFFIX,
        appmsg=prefix + ApplyByCols._DEF_APP_MSG_SUFFIX,
        desc=prefix + ApplyByCols._DEF_DESCRIPTION_SUFFIX,
    )
    parent_args.update(**kwargs)
    super().__init__(**parent_args)
def __init__(self, language, columns, drop=True, **kwargs):
    """Configure a stopword-removing value-map stage over ``columns``.

    Parameters
    ----------
    language : str or iterable
        A string is resolved to a stopword list through the private
        ``__stopwords_by_language`` helper; any other iterable is
        copied into a list and used as the stopword list directly.
    columns
        Column label(s); normalised for messages and forwarded to the
        parent constructor.
    drop
        Forwarded to the parent constructor.
    **kwargs
        Override any of the default keyword arguments built below.

    Raises
    ------
    TypeError
        If ``language`` is neither a string nor an iterable.
    """
    # The ABC aliases on the top-level `collections` module were removed
    # in Python 3.10; `collections.abc` is their supported home.
    from collections.abc import Iterable

    self._language = language
    if isinstance(language, str):
        self._stopwords_list = RemoveStopwords.__stopwords_by_language(
            language)
    elif isinstance(language, Iterable):
        self._stopwords_list = list(language)
    else:
        raise TypeError("language parameter should be string or list!")
    self._stopwords_remover = RemoveStopwords._StopwordsRemover(
        self._stopwords_list)
    self._columns = _interpret_columns_param(columns)
    col_str = _list_str(self._columns)
    super_kwargs = {
        'columns': columns,
        'value_map': self._stopwords_remover,
        'drop': drop,
        'suffix': '_nostop',
        'exmsg': RemoveStopwords._DEF_STOPWORDS_EXC_MSG.format(col_str),
        'appmsg': RemoveStopwords._DEF_STOPWORDS_APP_MSG.format(col_str),
        'desc': "Removing stopwords from {}".format(col_str),
    }
    # Caller-supplied keyword arguments win over the defaults above.
    super_kwargs.update(**kwargs)
    super().__init__(**super_kwargs)
def __init__(self, columns, bad_tokens, result_columns=None, drop=True,
             **kwargs):
    """Set up a stage that drops tokens found in a given list.

    Parameters
    ----------
    columns : str or sequence
        Column label(s); forwarded to the parent constructor.
    bad_tokens : sequence of str
        Tokens handed to ``DropTokensByList.ListTokenFilter``. They are
        spelled out in the messages only when there are fewer than ten.
    result_columns : optional
        Forwarded to the parent constructor (previously this argument was
        accepted but silently ignored).
    drop
        Forwarded to the parent constructor.
    **kwargs
        Override any of the default keyword arguments built below.
    """
    self._bad_tokens = bad_tokens
    col_str = _list_str(columns)
    # A single column given as a plain string must not be pluralised by
    # the number of characters in its name.
    is_plural = not isinstance(columns, str) and len(columns) > 1
    sfx = "s" if is_plural else ""
    cond_str = ""
    if len(bad_tokens) < 10:
        cond_str = "in list [" + " ".join(bad_tokens) + "]"
    base_str = DropTokensByList._BASE_STR.format(cond_str, sfx, col_str)
    super_kwargs = {
        "columns": columns,
        "func": DropTokensByList.ListTokenFilter(bad_tokens),
        "colbl_sfx": "_filtered",
        # Pass the caller's choice through instead of dropping it.
        "result_columns": result_columns,
        "drop": drop,
        "exmsg": base_str + DropTokensByList._DEF_EXC_MSG_SUFFIX,
        "appmsg": base_str + DropTokensByList._DEF_APP_MSG_SUFFIX,
        "desc": base_str + DropTokensByList._DEF_DESCRIPTION_SUFFIX,
    }
    # Caller-supplied keyword arguments win over the defaults above.
    super_kwargs.update(**kwargs)
    super().__init__(**super_kwargs)
def test_list_str():
    """Check `_list_str` on None, on sequences, and on scalar inputs."""
    # None must come back as the None singleton, not merely compare equal.
    assert _list_str(None) is None

    expected_by_input = [
        (['a', 'b'], 'a, b'),
        ('a', 'a'),
        ((1, 2), '1, 2'),
        (5, 5),
    ]
    for value, expected in expected_by_input:
        assert _list_str(value) == expected