def train(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
    # Todo: Rapids numpy
    """
    Parameters:
        X: (pd.DataFrame) dataset without the target feature
        y: (np.ndarray) target

    Returns:
        self
    """
    # currently supports only one learner; very brittle parser
    if self.kind == {}:
        raise_PasoError(
            "keyword kind must be present at top level: {}:".format(self.kind)
        )

    if self.kind_name not in NameToClass.__learners__:
        raise_PasoError(
            "train; no operation named: {} in learners: {}".format(
                self.kind_name, NameToClass.__learners__.keys()
            )
        )
    else:
        self.model_name = self.kind_name
        self.model = NameToClass.__learners__[self.kind_name](
            **self.kind_name_kwargs
        )
        self.model.fit(X, y)

    self.model_type = self.type
    self.trained = True
    return self
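# A minimal usage sketch (illustrative only; `Learner` and the .yaml path are
# hypothetical stand-ins for whatever class hosts train()):
#
#     learner = Learner(description_filepath="descriptions/learners/RandomForest.yaml")
#     # the ontological file supplies kind/kind_name/kind_name_kwargs;
#     # train() then instantiates the named learner class and fits it
#     learner.train(X_train, y_train)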
def _parse_metrics(self, y: np.ndarray, **kwargs):
    """
    Parse the metrics dict.

    Parameters:
        y: (np.ndarray) target or dependent feature of dataset.
    """
    self.n_class = len(np.unique(y))
    self.class_names = _array_to_string(np.unique(y))
    self.metrics = {}
    if self.model_type in NameToClass.__metrics__:
        self.metrics_names = list(NameToClass.__metrics__[self.model_type].keys())
        self.metrics_list = list(NameToClass.__metrics__[self.model_type].values())
        self.metrics_f = [m[0] for m in self.metrics_list]
        self.metrics_f_type = [m[1] for m in self.metrics_list]
        self.metrics_f_kwargs = [m[2] for m in self.metrics_list]
        self.metrics_needs_proba = [m[3] for m in self.metrics_list]
        # this list will be shorter if non-binary,
        # which is ok but misleading as it is only used by cv
        self.metrics_scoring = [
            self.metrics_list[k][4]
            for k in range(len(self.metrics_names))
            if self.metrics_list[k][4]
            and (
                (self.metrics_f_type[k] == NameToClass.BINARY)
                and (self.n_class == NameToClass.NBINARY)
                or (self.metrics_f_type[k] != NameToClass.BINARY)
                and (self.n_class > NameToClass.NBINARY)
            )
        ]
    else:
        raise_PasoError(
            "_parse_metrics; no type named: {} in: {}".format(
                self.model_type, NameToClass.__metrics__.keys()
            )
        )
    return self
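# For orientation: the indexing above implies each NameToClass.__metrics__
# entry maps a metric name to a 5-tuple. A sketch of that inferred layout
# (names and values are illustrative, not taken from the source):
#
#     __metrics__ = {
#         "Classification": {
#             "accuracy": (accuracy_score, BINARY, {}, False, "accuracy"),
#             #            [0] callable    [1] f_type   [2] kwargs
#             #            [3] needs_proba              [4] cv scoring name
#         },
#     }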
def transform(self, X, **kwargs):
    # Todo: Rapids numpy
    """
    Parameters:
        X: (pd.DataFrame) dataset
        inplace:
        target: dependent feature which is the "target" of the trainer
        verbose: (bool) (optional) can be set. Default: True
        kind

    Returns:
        result of the last EDAer's transform
    """
    # check keywords in passed argument stream;
    # non-optional kws are initialized with None
    if self.target is None:
        raise_PasoError(
            "target not specified through keyword call or ontological file for EDA: {}".format(
                self
            )
        )
    if self.kwargs is None:
        raise_PasoError("EDA_kwargs not specified for kind: {}".format(self))

    if _check_non_optional_kw(
        self.kind,
        msg="Splitter:transform target= non-optional: {}".format(self.target),
    ):
        self.y_train = X[self.target].values
        self.X_train = X[X.columns.difference([self.target])]

    self.clf = self.model  # start with base learner
    result = None  # capture each EDAer's output; the original returned an undefined name
    for i, EDA_name in enumerate(self.EDA_kins):
        if self.verbose:
            logger.info("EDAer: {}".format(EDA_name))
        EDA_kind_kwargs = self.EDA_kind_kwargs_list[i]
        if self.verbose:
            logger.info("EDAer: {} kwargs: {}".format(EDA_name, EDA_kind_kwargs))
        self.EDAer = NameToClass.__EDAers__[EDA_name]()
        result = self.EDAer.transform(self.X_train, self.y_train, **EDA_kind_kwargs)
    return result
def _inputer_cvs(self, **kwargs):
    kw = "names"
    self.names = _dict_value(kwargs, kw, [])

    kw = "directory_path"
    self.directory_path = _dict_value(kwargs, kw, "")

    train = test = sampleSubmission = None  # guard against unset datasets

    kw = "train"
    if kw in kwargs:
        self.train_path = self.directory_path + kwargs[kw]
        if os.path.exists(self.train_path) or _url_path_exists(self.train_path):
            if self.names != []:
                train = pd.read_csv(self.train_path, names=self.names)
            else:
                train = pd.read_csv(self.train_path)
        else:
            raise_PasoError(
                "Inputer train dataset path does not exist: {} or there might not be a directory_path: {}".format(
                    self.train_path, self.directory_path
                )
            )

    kw = "test"
    if kw in kwargs:
        self.test_path = self.directory_path + kwargs[kw]
        if os.path.exists(self.test_path):
            if self.names != []:
                test = pd.read_csv(self.test_path, names=self.names)
            else:
                test = pd.read_csv(self.test_path)
        else:
            raise_PasoError(
                "Inputer test dataset path does not exist: {}".format(self.test_path)
            )

    kw = "sampleSubmission"
    if kw in kwargs:
        self.sampleSubmission_path = self.directory_path + kwargs[kw]
        if os.path.exists(self.sampleSubmission_path):
            if self.names != []:
                sampleSubmission = pd.read_csv(
                    self.sampleSubmission_path, names=self.names
                )
            else:
                sampleSubmission = pd.read_csv(self.sampleSubmission_path)
        else:
            raise_PasoError(
                "Inputer sampleSubmission dataset path does not exist: {}".format(
                    self.sampleSubmission_path
                )
            )

    # no case/switch statement in this Python version
    if self.dataset == "train":
        return train
    elif self.dataset == "valid":
        # "valid" was accepted here but never loaded; fail clearly instead
        # of raising NameError on an undefined name
        raise_PasoError("Inputer: no valid dataset is loaded by _inputer_cvs")
    elif self.dataset == "test":
        return test
    elif self.dataset == "sampleSubmission":
        return sampleSubmission
    else:
        raise_PasoError("dataset not recognized: {} ".format(self.dataset))
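# The keyword stream this parser expects, sketched as a hypothetical
# ontological-file fragment (paths and names are illustrative):
#
#     kind_name_kwargs:
#         directory_path: data/iris/
#         train: train.csv
#         test: test.csv
#         sampleSubmission: sampleSubmission.csv
#         names: [sepal_length, sepal_width, petal_length, petal_width, class]
#
# Which of train/test/sampleSubmission is returned is selected by
# self.dataset, set from the `dataset` keyword in transform() below.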
def transform(self, *args, **kwargs):
    # Todo: Rapids numpy
    """
    Main method to input a file or url; otherwise an error is raised.

    Parameters:
        None

    Keywords:
        input_path: (str) the data source path name. The path can be a url
            or local. Format must be csv or csv/zip.
        target: the dependent feature name of this dataset.
        drop: (list) list of feature names to drop from dataset;
            X, y are then extracted from the dataset.

    Attributes set:
        self.target: (str)
        self.input_path = input_path

    Returns:
        dataset: (pd.DataFrame) complete dataset input from the data source.
    """
    # currently supports only one inputer; very brittle parser
    kwa = "target"
    self.target = _dict_value(self.kind_name_kwargs, kwa, None)
    # check the value, not the keyword name; the original passed the literal
    # string "target", which is always truthy
    _check_non_optional_kw(
        self.target,
        "Inputer: needs target keyword. Probably not set in ontological file.",
    )

    # currently can only be an inputer/transform kwarg
    kwa = "dataset"
    self.dataset = _dict_value(kwargs, kwa, "train")

    # create an instance of this particular inputer;
    # checks for non-optional keywords
    if self.kind_name not in Inputers._inputer_:
        raise_PasoError(
            "transform; no format named: {} in Inputers: {}".format(
                self.kind_name, Inputers._inputer_.keys()
            )
        )

    if _formats_supported(self.description_filepath):
        self.input_data_set = True
        return Inputers._inputer_[self.kind_name](self, **self.kind_name_kwargs)
def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
    """
    Parameters:
        X: (pd.DataFrame) column(s) are independent features of dataset

    Returns:
        y_pred: (np.ndarray) predicted target probabilities

    Warning:
        Assumes train has been called.
    """
    # enforce order of method calls
    if not self.trained:
        raise_PasoError("Must be 'fit' before predict_proba. ")
    self.predicted_proba = True
    return self.model.predict_proba(X)
def predict(self, X: pd.DataFrame) -> np.ndarray:
    """
    Parameters:
        X: (pd.DataFrame) column(s) are independent features of dataset

    Returns:
        y_pred: (np.ndarray) predicted target.

    Warning:
        Assumes train has been called.
    """
    # enforce order of method calls
    if not self.trained:
        raise_PasoError("Must be 'fit' before predict. ")
    self.predicted = True
    return self.model.predict(X)
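# Call-order sketch: the `trained`/`predicted` flags enforce a strict
# train -> predict/predict_proba sequence (object name is illustrative):
#
#     learner.train(X_train, y_train)        # sets self.trained
#     y_hat = learner.predict(X_test)        # sets self.predicted
#     p_hat = learner.predict_proba(X_test)  # class probabilities
#
# Calling predict() before train() raises a PasoError rather than failing
# deep inside the underlying model.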
def __init__(self, encoderKey, verbose=False, *args, **kwargs):
    super().__init__()
    if encoderKey in __ScalerDict__:
        Encoder = __ScalerDict__[encoderKey](*args)
    else:
        # raise_PasoError raises itself; the original's extra `raise` was a bug
        raise_PasoError("paso:scale: No scaler named: {} found.".format(encoderKey))
    self.encoderKey = encoderKey
    self.model = Encoder
    validate_bool_kwarg(verbose, "verbose")
    self.verbose = verbose
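# Usage sketch (hypothetical; the valid encoderKey strings are whatever keys
# __ScalerDict__ holds, e.g. a "StandardScaler"-style key if present):
#
#     scaler = Scaler("StandardScaler", verbose=True)
#     # scaler.model is now the instantiated underlying scaler object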
def _inputer_exec(self, **kwargs):
    # must always be dataset='train', or no dataset= at all
    if self.dataset != "train" and ("train" not in kwargs):
        raise_PasoError(
            "dataset='{}' not recognized: in {} ".format(self.dataset, kwargs)
        )

    key = ["pre", "post"]
    # the original compared against None for "pre" but the string "None" for
    # "post"; treat both as unset in either slot
    if key[0] in kwargs and kwargs[key[0]] not in (None, "None"):
        for stmt in kwargs[key[0]]:
            exec(stmt)

    result = None  # guard: "create-df" may be absent
    dfkey = "create-df"
    if dfkey in kwargs and kwargs[dfkey] not in (None, "None"):
        result = eval(kwargs[dfkey])

    if key[1] in kwargs and kwargs[key[1]] not in (None, "None"):
        for stmt in kwargs[key[1]]:
            exec(stmt)

    return result
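# Sketch of the kwargs this exec-style inputer consumes (the statements and
# expression are illustrative): `pre` and `post` are lists of Python
# statements run with exec(), and `create-df` is a single expression whose
# eval() result becomes the returned DataFrame:
#
#     kwargs = {
#         "pre": ["import sklearn.datasets as ds"],
#         "create-df": "pd.DataFrame(ds.load_iris().data)",
#         "post": ["logger.info('created df')"],
#     }
#
# Note that exec()/eval() on file-supplied strings runs arbitrary code; the
# ontological file must be trusted.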
def inverse_predict(self, Xarg, inplace=False, **kwargs):
    """
    Args:
        Xarg (array-like): Predictions of different models for the labels.

    Returns:
        (pd.DataFrame): inverse of Xarg
    """
    X = Xarg.values
    if self.trained and self.predicted:
        X = self.model.inverse_transform(X)
        if self.verbose:
            logger.info("Scaler:inverse_transform: {}".format(self.encoderKey))
        return toDataFrame().transform(X, labels=Xarg.columns, inplace=False)
    else:
        # raise_PasoError raises itself; the original's extra `raise` was a bug
        raise_PasoError(
            "scale:inverse_transform: must call train and predict before inverse"
        )
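# Round-trip sketch (object and method names illustrative, assuming the
# trained/predicted flags are set by this class's train/predict):
#
#     scaler.train(df)                           # fit the underlying scaler
#     scaled = scaler.predict(df)                # transformed values
#     restored = scaler.inverse_predict(scaled)  # approximately df again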
def tune_hyperparameters(self, X, y, **kwargs):
    # Todo: Rapids numpy
    """
    Parameters:
        X: (pd.DataFrame) dataset
        y: (np.ndarray) target

    Returns:
        self
    """
    if self.tune_description_filepath == "":
        raise_PasoError(
            "tune_hyperparameters-validation requires cv_description_filepath=<fp> keyword"
        )
    if self.kind == {}:
        raise_PasoError(
            "keyword kind=<tune_hyperparameters_name> must be present at top level: {}:".format(
                self.description_kwargs
            )
        )

    if self.kind_name not in NameToClass.__hp_optimizers__:
        raise_PasoError(
            "hp_optimizers; no operation named: {} in: {}".format(
                self.kind_name, NameToClass.__hp_optimizers__.keys()
            )
        )
    else:
        self.hp_optimize_name = self.kind_name
        self.hp_optimize_model = NameToClass.__hp_optimizers__[self.kind_name](
            **self.kind_name_kwargs
        )
        self.hp_optimize_model.fit(X, y)

    self.hp_optimize_model_type = self.type
    self.hp_optimized = True
    return self
def cross_validate(self, X: pd.DataFrame, y: np.ndarray, **kwargs) -> Dict:
    # Todo: Rapids numpy
    """
    Parameters:
        X: (pd.DataFrame) column(s) are independent features of dataset
        y: (np.ndarray) target or dependent feature of dataset.
        cv_description_filepath: (str)

    Returns:
        d dict: statistics of the dict of metrics, e.g.:

            {'mean': {'fit_time': 0.47127442359924315,
                      'score_time': 0.6575253486633301,
                      'test_AOC': 0.9740188014101058,
                      'test_accuracy': 0.9740188014101058,
                      'test_f1_score': 0.9740188014101058,
                      'test_precision': 0.9740188014101058,
                      'test_recall': 0.9740188014101058,
                      'test_logloss': -0.08630526290291975},
             'median': {'fit_time': 0.4730620384,
             .
             .
             .
             'var': {'fit_time': 1.0282657726747856e-05,
                     'score_time': 3.292675623924879e-06,
                     'test_AOC': 0.00014246376351316793,
                     'test_accuracy': 0.00014246376351316855,
                     'test_f1_score': 0.00014246376351316855,
                     'test_precision': 0.00014246376351316855,
                     'test_recall': 0.00014246376351316855,
                     'test_logloss': 0.0005022676227615732}}
    """
    if self.cv_description_filepath == "":
        raise_PasoError(
            "Cross-validation requires description_filepath=<fp> keyword"
        )
    if self.kind == {}:
        raise_PasoError(
            "keyword kind=<cross_validaters_name> must be present at top level: {}:".format(
                self.description_kwargs
            )
        )

    if self.kind_name not in NameToClass.__cross_validators__:
        raise_PasoError(
            "cross_validate; no operation named: {} in cross_validators: {}".format(
                self.kind_name, NameToClass.__cross_validators__.keys()
            )
        )

    self._parse_metrics(y)
    self.cross_validate_name = self.kind_name
    self.cross_validate_model = NameToClass.__cross_validators__[self.kind_name]
    self.cv = self.kind_name_kwargs["cv"]
    scores = self.cross_validate_model(
        self.model, X, y, scoring=self.metrics_scoring, **self.kind_name_kwargs
    )
    self.cv_metrics = sorted(scores.keys())
    self.cross_validate_model_type = self.type
    self.cross_validated = True
    return _stat_arrays_in_dict(scores)
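# Usage sketch (illustrative): cross_validate() wraps the callable looked up
# in NameToClass.__cross_validators__ (an sklearn-style cross_validate) and
# reduces each metric's per-fold array to summary statistics:
#
#     learner.train(X, y)
#     stats = learner.cross_validate(X, y)   # kind/cv come from the .yaml
#     print(stats["mean"]["test_accuracy"], stats["var"]["test_accuracy"])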
def toDataFrame(X: any, columns: list = [], verbose: bool = True) -> pd.DataFrame:
    """
    Transform a list, tuple, csr_matrix, numpy 1-D or 2-D array,
    or pandas Series into a DataFrame.

    Parameters:
        X: dataset

    Keywords:
        columns: default: []
            The column names to be used for the new DataFrame. If fewer
            column names are given than needed, the rest are generated as
            c_0 ... c_n, where n is the index of a missing column name.

        verbose:
            True: output
            False: silent

    Raises:
        1. ValueError will result from an unknown argument type.
        2. ValueError will result if columns is not a string or list of strings.

    Returns:
        dataset as pd.DataFrame

    Note:
        A best practice is to make your dataset of type ``DataFrame`` at the
        start of your pipeline and keep the original DataFrame throughout the
        pipeline of your experimental run to maximize speed of completion and
        minimize memory usage. THIS IS NOT THREAD SAFE.

        Almost all **paso** objects call ``toDataFrame(argument)``, which, if
        the argument is of type ``DataFrame``, is about 500x faster (about
        2 ns) for ``inplace=False`` on a single thread for a 1,000,000x8
        DataFrame.

        If the input argument is of type DataFrame, the passed DataFrame is
        returned as if ``inplace=True``, and ``columns`` is ignored. If of
        other than type ``DataFrame``, then ``inplace=False``; ``inplace``
        is ignored and only remains for backwards compatibility.

        Based on 'willmcginnis' convert_input.
    """
    _fun_name = toDataFrame.__name__
    if len(X) == 0:
        raise_PasoError("{} X:any is of length 0: {} ".format(_fun_name, str(type(X))))
    if not isinstance(X, pd.DataFrame):
        if isinstance(X, pd.Series):
            X = pd.DataFrame(X, copy=True)
        elif isinstance(X, list):
            # lists are always copied, but for consistency, we still pass copy=True
            X = pd.DataFrame(X, copy=True)
        elif isinstance(X, (np.generic, np.ndarray)):
            if X.ndim != 2:
                raise_PasoError(
                    "{} X (1st arg): wrong dimension. must be 2: was {} dim ".format(
                        _fun_name, str(X.ndim)
                    )
                )
            X = pd.DataFrame(X, copy=True)
        elif isinstance(X, csr_matrix):
            X = pd.DataFrame(X.todense(), copy=True)
        else:
            raise_PasoError(
                "{} Unexpected input type: {}".format(_fun_name, str(type(X)))
            )

        # generate any missing column names; per the docstring, a passed
        # DataFrame keeps its own columns, so this stays in the non-DataFrame branch
        new_col_names = columns
        for i in range(len(columns), X.shape[1]):
            new_col_names.append("c_" + str(i))
        X.columns = new_col_names

    if verbose:
        logger.info("{} with \ncolumn names: {}".format(_fun_name, X.columns))
    return X
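# A minimal sketch of the column-name generation (values illustrative):
#
#     arr = np.arange(6).reshape(3, 2)
#     df = toDataFrame(arr, columns=["amount"], verbose=False)
#     # one name was supplied for two columns, so the second is generated:
#     # df.columns -> ["amount", "c_1"]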
def toDatetimeComponents(
    oX: pd.DataFrame,
    drop: bool = True,
    components: list = [],
    prefix: bool = True,
    inplace: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Parameters:
        X: dataset

    Keywords:
        drop: default: True
            If True then the datetime feature/column will be removed.

        components: default: [] which results in all components
            List of column (feature) names for which datetime components
            are created. One or more of:
            ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
             'Elapsed', 'Is_month_end', 'Is_month_start', 'Is_quarter_end',
             'Is_quarter_start', 'Is_year_end', 'Is_year_start']

        prefix: default: True
            If True then the feature name will be the prefix of the created
            datetime component features. The postfix will be _<component>,
            creating the new feature column <feature>_<component>.
            If False, only the first _PREFIX_LENGTH_ characters of the
            feature string will be used to create the new feature
            name/column <featurename[0:_PREFIX_LENGTH_]>_<component>.

        verbose: default: True
            True: output
            False: silent

        inplace: default: True
            True: replace 1st argument with the resulting DataFrame.
            False: leave the DataFrame X unchanged.

    Returns:
        pd.DataFrame transformed into datetime feature components

    Raises:
        1. ValueError: if dt_features = [].
        2. ValueError: if any feature has NA values.

    Note:
        Successful coercion to ``datetime`` costs approximately 100x more
        than if X[[dt_features]] was already of type datetime. Because of
        this cost, a possible date will **NOT** be converted to ``datetime``
        type. Another way, using a double negative: if X[[dt_features]] is
        not of datetime type (such as ``object`` type) then there **IS NO**
        attempt made to coerce X[[dt_features]] to ``datetime`` type. It is
        best if the raw data field is read/input as ``datetime`` rather
        than ``object``. Another way is to convert the DataFrame column
        directly.

        Assumes **paso** data cleaning steps (such as removal of Null and
        NA values) have already been applied.
    """
    _fun_name = toDatetimeComponents.__name__
    # todo put in decorator
    if inplace:
        X = oX
    else:
        X = oX.copy()

    if components == []:
        components = [k for k in DatetimetoComponents.COMPONENT_DICT.keys()]

    if not isDataFrame(X):
        raise_PasoError("{} not passed DataFrame".format(_fun_name))

    for feature in X.columns:
        _Check_No_NA_F_Values(X, feature)
        try:
            # object/str converted to datetime, if possible
            # (plain `object` also matches deprecated np.object)
            Xt = X[feature].dtype
            if Xt == object:
                X[feature] = pd.to_datetime(X[feature])
            # set new component feature name
            if prefix:
                fn = feature + "_"
            else:
                fn = feature[0:DatetimetoComponents._PREFIX_LENGTH_] + "_"
            for component in components:
                if component.lower() == "elapsed":
                    # ns to seconds
                    X[fn + "Elapsed"] = (
                        X[feature].astype(np.int64) // 10 ** 9
                    ).astype(np.int32)
                else:
                    X[fn + component] = getattr(X[feature].dt, component.lower())
                if verbose:
                    logger.info(
                        "datetime feature component added: {}".format(fn + component)
                    )
            if drop:
                X.drop(feature, axis=1, inplace=True)
                if verbose:
                    logger.info("datetime feature dropped: {}".format(feature))
        except:
            pass  # tried, but not in datetime format
    return X
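# A minimal sketch (illustrative): a single datetime column expands into
# component features and, with drop=True (the default), replaces the original:
#
#     df = pd.DataFrame({"sold": pd.to_datetime(["2019-06-01", "2019-07-04"])})
#     out = toDatetimeComponents(df, components=["Year", "Month"], inplace=False)
#     # out.columns -> ["sold_Year", "sold_Month"]; "sold" itself is dropped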
def _formats_supported(path):
    for fmt in Inputers._formats_.keys():
        if path.endswith(fmt):
            return fmt
    # raise_PasoError raises itself; the original's extra `raise` was a bug
    raise_PasoError("format of this file not supported: {}".format(path))