Example #1
    def train(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
        # TODO: Rapids/numpy support
        """
        Parameters:
            X: pandas DataFrame, the dataset without the target column
            y: numpy array, the target
        Returns:
            self
        """
        # currently supports only one learner; very brittle parser
        if self.kind == {}:
            raise_PasoError(
                "keyword kind must be present at top level: {}".format(self.kind)
            )

        if self.kind_name not in NameToClass.__learners__:
            raise_PasoError(
                "train: no learner named: {} in learners: {}".format(
                    self.kind_name, NameToClass.__learners__.keys()
                )
            )
        else:
            self.model_name = self.kind_name
            self.model = NameToClass.__learners__[self.kind_name](
                **self.kind_name_kwargs
            )
            self.model.fit(X, y)
            self.model_type = self.type

        self.trained = True

        return self
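A minimal usage sketch of train; the learner construction from an ontological file is assumed to have already happened, and the data and names below are hypothetical:

    # hypothetical usage; assumes `learner` was configured so that
    # self.kind_name names an entry in NameToClass.__learners__
    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"f0": [0.1, 0.2, 0.3, 0.4], "f1": [1.0, 0.0, 1.0, 0.0]})
    y = np.array([0, 1, 0, 1])
    learner.train(X, y)      # fits the wrapped model and returns self
    assert learner.trained   # train() sets this flag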
Example #2
    def _parse_metrics(self, y: np.ndarray, **kwargs):
        """"
        pasre the metics dict

        Parameters:

            y: (numpy vector )  target or dependent feature of dataset.

        """
        classes = np.unique(y)
        self.n_class = len(classes)
        self.class_names = _array_to_string(classes)

        self.metrics = {}
        if self.model_type in NameToClass.__metrics__:
            self.metrics_names = list(NameToClass.__metrics__[self.model_type].keys())
            self.metrics_list = list(NameToClass.__metrics__[self.model_type].values())
            self.metrics_f = [m[0] for m in self.metrics_list]
            self.metrics_f_type = [m[1] for m in self.metrics_list]
            self.metrics_f_kwargs = [m[2] for m in self.metrics_list]
            self.metrics_needs_proba = [m[3] for m in self.metrics_list]
            # this list will be shorter for the non-binary case;
            # OK, but misleading, as it is only used by cross-validation
            self.metrics_scoring = [
                self.metrics_list[k][4]
                for k in range(len(self.metrics_names))
                if self.metrics_list[k][4]
                and (
                    (self.metrics_f_type[k] == NameToClass.BINARY)
                    and (self.n_class == NameToClass.NBINARY)
                    or (self.metrics_f_type[k] != NameToClass.BINARY)
                    and (self.n_class > NameToClass.NBINARY)
                )
            ]
        else:
            raise_PasoError(
                "parse_metrics: no model type named: {}; not in: {}".format(
                    self.model_type, NameToClass.__metrics__.keys()
                )
            )

        return self
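A sketch of the per-metric tuple layout this parser assumes, inferred from the indices 0 through 4 used above; the entries below are hypothetical scikit-learn stand-ins, not paso's actual table:

    # assumed entry shape:
    # name -> (function, type, kwargs, needs_proba, cv_scoring_name_or_falsy)
    from sklearn.metrics import accuracy_score, f1_score

    __metrics__ = {
        "Classification": {
            "accuracy": (accuracy_score, "MULTICLASS", {}, False, "accuracy"),
            "f1_score": (f1_score, "MULTICLASS", {"average": "macro"}, False, "f1_macro"),
        }
    }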
Example #3
File: EDA.py Project: bcottman/paso
    def transform(self, X, **kwargs):
        # TODO: Rapids/numpy support
        """
        Parameters:
            X: (DataFrame) dataset, including the target feature

        Keywords:
            target: dependent feature which is the "target" of the trainer
            verbose: (boolean) (optional) can be set. Default: True

        Returns:
            result of the last EDAer transform

        Raises:
            PasoError: if target or EDA kwargs were not specified
        """

        # check keywords in the passed argument stream;
        # non-optional keywords are initialized with None

        if self.target is None:
            raise_PasoError(
                "target not specified through keyword call or ontological file for EDA: {}".format(
                    self
                )
            )

        if self.kwargs is None:
            raise_PasoError(
                "EDA_kwargs not specified for kind: {}".format(self)
            )

        # _check_non_optional_kw raises on failure, so no conditional branch is needed
        _check_non_optional_kw(
            self.kind,
            msg="EDA:transform target= non-optional: {}".format(self.target),
        )
        self.y_train = X[self.target].values
        self.X_train = X[X.columns.difference([self.target])]
        self.clf = self.model  # start with the base learner
        for i, EDA_name in enumerate(self.EDA_kins):
            if self.verbose:
                logger.info("EDAer: {}".format(EDA_name))

            EDA_kind__kwargs = self.EDA_kind_kwargs_list[i]
            if self.verbose:
                logger.info("EDAer: {} kwargs: {}".format(EDA_name, EDA_kind__kwargs))
            self.EDAer = NameToClass.__EDAers__[EDA_name]()
            # capture the transform output; `result` was previously unbound
            result = self.EDAer.transform(self.X_train, self.y_train, **EDA_kind__kwargs)

        return result
Example #4
def _inputer_cvs(self, **kwargs):
    kw = "names"
    self.names = _dict_value(kwargs, kw, [])

    kw = "directory_path"
    self.directory_path = _dict_value(kwargs, kw, "")

    kw = "train"
    if kw in kwargs:
        self.train_path = self.directory_path + kwargs[kw]
        if os.path.exists(self.train_path) or _url_path_exists(
                self.train_path):
            if self.names != []:
                train = pd.read_csv(self.train_path, names=self.names)
            elif self.names == []:
                train = pd.read_csv(self.train_path)
        else:
            raise_PasoError(
                "Inputer train dataset path does not exist: {} or there might not be a directory_path:{}"
                .format(self.train_path, self.directory_path))

    kw = "test"
    if kw in kwargs:
        self.test_path = self.directory_path + kwargs[kw]
        if os.path.exists(self.test_path):
            if self.names != []:
                test = pd.read_csv(self.test_path, names=self.names)
            elif self.names == []:
                test = pd.read_csv(self.test_path)
        else:
            raise_PasoError(
                "Inputer test dataset path does not exist: {}".format(
                    self.test_path))

    kw = "sampleSubmission"
    if kw in kwargs:
        self.sampleSubmission_path = self.directory_path + kwargs[kw]
        if os.path.exists(self.sampleSubmission_path):
            if self.names != []:
                sampleSubmission = pd.read_csv(self.sampleSubmission_path,
                                               names=self.names)
            elif self.names == []:
                sampleSubmission = pd.read_csv(self.sampleSubmission_path)
        else:
            raise_PasoError(
                "Inputer sampleSubmission dataset path does not exist: {}".
                format(self.test_path))

    # Python has no switch/case; dispatch with if/elif
    if self.dataset == "train":
        return train
    elif self.dataset == "test":
        return test
    elif self.dataset == "sampleSubmission":
        return sampleSubmission
    else:
        # note: no "valid" dataset is ever read above, so returning `valid`
        # would have raised NameError; treat it as unrecognized instead
        raise_PasoError("dataset not recognized: {} ".format(self.dataset))
Example #5
    def transform(self, *args, **kwargs):
        # Todo:Rapids numpy
        """"
        main method to input file or url,
        or an error will be raised.

        parameters: None

        keywords:
            input_path: (str) the data source source path name.
                The path can be url or local. Format must be csv or csv/zip.

            target: the dependent feature name of this data_set.

            drop: (list) list of feature names to drop from
                dataset, X,y are then extracted from dataset.

        attributes set:
            self.target: (str) the target feature name
            self.input_path: (str) the data source path

        returns:
            dataset: (DataFrame) complete dataset input from data source.
        """

        # currently supports only one inputer; very brittle parser
        kwa = "target"
        self.target = _dict_value(self.kind_name_kwargs, kwa, None)
        # check the parsed value, not the keyword name
        _check_non_optional_kw(
            self.target,
            "Inputer: needs target keyword; probably not set in the ontological file."
        )
        # currently can only be set in the inputer/transform kwargs
        kwa = "dataset"
        self.dataset = _dict_value(kwargs, kwa, "train")

        # create instance of this particular learner
        # checks for non-optional keyword
        if self.kind_name not in Inputers._inputer_:
            raise_PasoError(
                "transform: no inputer named: {}; not in: {}".format(
                    self.kind_name, Inputers._inputer_.keys()))

        if _formats_supported(self.description_filepath):
            self.input_data_set = True
            return Inputers._inputer_[self.kind_name](self,
                                                      **self.kind_name_kwargs)
Example #6
    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """
        parameters:
            X: (DataFrame) column(s) are independent features of the dataset

        Returns:
            y_pred: (numpy array of floats) predicted class probabilities

        Warning:
            Assumes train has been called.
        """
        # enforce order of method calls
        if not self.trained:
            raise_PasoError("Must be ['fit'] before predict_proba. ")

        self.predicted_proba = True
        return self.model.predict_proba(X)
Example #7
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        parameters:
            X: (DataFrame) column(s) are independent features of the dataset

        Returns:
            y_pred: (numpy array) predicted target.

        Warning:
            Assumes train has been called.
        """
        # enforce order of method calls
        if not self.trained:
            raise_PasoError("Must be 'fit' before predict. ")
        self.predicted = True
        return self.model.predict(X)
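The enforced call order, as a sketch (continuing the hypothetical learner from Example #1):

    learner.train(X, y)                # must come first; sets learner.trained
    y_pred = learner.predict(X)        # predicted class labels
    y_prob = learner.predict_proba(X)  # per-class probabilities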
Example #8
    def __init__(self, encoderKey, verbose=False, *args, **kwargs):
        super().__init__()
        if encoderKey in __ScalerDict__:
            Encoder = __ScalerDict__[encoderKey](*args)
        else:
            # raise_PasoError raises itself; the bare `raise raise_PasoError(...)` was a bug
            raise_PasoError(
                "paso:scale: No scaler named: {} found.".format(encoderKey))
        self.encoderKey = encoderKey
        self.model = Encoder
        validate_bool_kwarg(verbose, "verbose")
        self.verbose = verbose
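A hypothetical construction, assuming the enclosing class is the scaler wrapper and that __ScalerDict__ maps a key such as "StandardScaler" to a scikit-learn scaler class (check __ScalerDict__ for the real keys):

    scaler = Scaler("StandardScaler", verbose=True)  # class name assumed
    scaler.model   # the wrapped scaler instance, ready for train/predict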
Example #9
def _inputer_exec(self, **kwargs):

    # dataset must be "train", or a "train" keyword must be present
    if self.dataset != "train" and ("train" not in kwargs):
        raise_PasoError("dataset='{}' not recognized in: {}".format(
            self.dataset, kwargs))

    key = ["pre", "post"]
    if key[0] in kwargs and kwargs[key[0]] != None:
        for stmt in kwargs[key[0]]:
            exec(stmt)

    dfkey = "create-df"
    if dfkey in kwargs and kwargs[dfkey] != None:
        result = eval(kwargs[dfkey])

    if key[1] in kwargs and kwargs[key[1]] != "None":
        for stmt in kwargs[key[1]]:
            exec(stmt)

    return result
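A sketch of the ontology keywords this exec-based inputer consumes; the statements below are hypothetical, and since they are executed as arbitrary Python they should only come from trusted description files:

    kwargs = {
        "pre": ["import pandas as pd"],                    # statements exec'd first
        "create-df": "pd.DataFrame({'target': [0, 1]})",   # eval'd; becomes result
        "post": ["logger.info('dataset created')"],        # statements exec'd last
    }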
Example #10
    def inverse_predict(self, Xarg, inplace=False, **kwargs):
        """
        Args:
            Xarg (DataFrame): data to be inverse-transformed back to the original scale.

        Returns:
            (DataFrame): inverse of Xarg
        """
        X = Xarg.values
        if self.trained and self.predicted:
            X = self.model.inverse_transform(X)
            if self.verbose:
                logger.info("Scaler:inverse_transform:{}".format(
                    self.encoderKey))
            return toDataFrame().transform(X,
                                           labels=Xarg.columns,
                                           inplace=False)
        else:
            raise_PasoError(
                "scale:inverse_transform: must call train and predict before inverse"
            )
Example #11
    def tune_hyperparameters(self, X, y, **kwargs):
        # TODO: Rapids/numpy support
        """
        Parameters:
            X: pandas DataFrame
            y: numpy array, the target
        Returns:
            self
        """
        if self.tune_description_filepath == "":
            raise_PasoError(
                "tune_hyperparameters requires tune_description_filepath=<fp> keyword"
            )

        if self.kind == {}:
            raise_PasoError(
                "keyword kind=<tune_hyperparameters_name> must be present at top level: {}".format(
                    self.description_kwargs
                )
            )

        if self.kind_name not in NameToClass.__hp_optimizers__:
            raise_PasoError(
                "hp_optimizers: no operation named: {}; not in: {}".format(
                    self.kind_name, NameToClass.__hp_optimizers__.keys()
                )
            )
        else:
            self.hp_optimize_name = self.kind_name
            self.hp_optimize_model = NameToClass.__hp_optimizers__[self.kind_name](
                **self.kind_name_kwargs
            )
            self.hp_optimize_model.fit(X, y)
            self.hp_optimize_model_type = self.type

        self.hp_optimized = True

        return self
Example #12
    def cross_validate(self, X: pd.DataFrame, y: np.ndarray, **kwargs) -> Dict:
        # Todo:Rapids numpy
        """
        Parameters:
            X: (DataFrame) column(s) are independent features of dataset
            y: (numpy vector) target or dependent feature of dataset.
            cv_description_filepath: str
        Returns:
            dict: statistics of the dict of metrics, e.g.:
            {'mean': {'fit_time': 0.47127442359924315,
              'score_time': 0.6575253486633301,
              'test_AOC': 0.9740188014101058,
              'test_accuracy': 0.9740188014101058,
              'test_f1_score': 0.9740188014101058,
              'test_precision': 0.9740188014101058,
              'test_recall': 0.9740188014101058,
              'test_logloss': -0.08630526290291975},
             'median': {'fit_time': 0.4730620384,
             .
             .
             .
             'var': {'fit_time': 1.0282657726747856e-05,
              'score_time': 3.292675623924879e-06,
              'test_AOC': 0.00014246376351316793,
              'test_accuracy': 0.00014246376351316855,
              'test_f1_score': 0.00014246376351316855,
              'test_precision': 0.00014246376351316855,
              'test_recall': 0.00014246376351316855,
              'test_logloss': 0.0005022676227615732}}
        """
        if self.cv_description_filepath == "":
            raise_PasoError(
                "Cross-validation requires cv_description_filepath=<fp> keyword"
            )

        if self.kind == {}:
            raise_PasoError(
                "keyword kind=<cross_validators_name> must be present at top level: {}".format(
                    self.description_kwargs
                )
            )

        if self.kind_name not in NameToClass.__cross_validators__:
            raise_PasoError(
                "cross_validate: no operation named: {}; not in __cross_validators__: {}".format(
                    self.kind_name, NameToClass.__cross_validators__.keys()
                )
            )

        self._parse_metrics(y)

        self.cross_validate_name = self.kind_name
        self.cross_validate_model = NameToClass.__cross_validators__[self.kind_name]
        self.cv = self.kind_name_kwargs["cv"]
        scores = self.cross_validate_model(
            self.model, X, y, scoring=self.metrics_scoring, **self.kind_name_kwargs
        )
        self.cv_metrics = sorted(scores.keys())
        self.cross_validate_model_type = self.type

        self.cross_validated = True

        return _stat_arrays_in_dict(scores)
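A usage sketch, reading the returned statistics dict (continuing the hypothetical learner; the metric keys depend on the scoring list parsed by _parse_metrics):

    stats = learner.cross_validate(X, y)
    stats["mean"]["test_accuracy"]   # mean CV accuracy across folds
    stats["var"]["fit_time"]         # variance of the per-fold fit times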
Example #13
def toDataFrame(X: any,
                columns: list = [],
                verbose: bool = True) -> pd.DataFrame:
    """
    Transform a list, csr_matrix, 2-D numpy array, or pandas Series into a DataFrame.

    Parameters:
        X: dataset

    Keywords:

        columns: default: []
            The column names to be used for the new DataFrame.
            If fewer column names are given than are needed, the missing
            names are generated as c_0 ... c_n, where n indexes the
            missing columns.

        verbose:
            True: output
            False: silent

    Raises:
        1. ValueError will result from an unknown argument type.
        2. ValueError will result if columns is not a string or list of strings.

    Returns: dataset as pd.DataFrame

    Note:
        A best practice is to make your dataset of type ``DataFrame`` at the start of
        your pipeline and keep that original DataFrame throughout the pipeline of your
        experimental run, to maximize speed of completion and minimize memory usage.
        THIS IS NOT THREAD SAFE.

        Almost all **paso** objects call ``toDataFrame(argument)``, which, if the
        argument is already of type ``DataFrame``, is about 500x faster: roughly 2 ns,
        single-threaded, for a 1,000,000x8 DataFrame.

        If the input argument is of type DataFrame, the passed DataFrame is returned
        unchanged and ``columns`` is ignored.

        For any other supported type, a new DataFrame (a copy) is returned, with
        ``columns`` applied.
    based on 'willmcginnis' convert_input
    """
    _fun_name = toDataFrame.__name__

    if len(X) == 0:
        raise_PasoError("{} X:any is of length O: {} ".format(
            _fun_name, str(type(X))))
    if not isinstance(X, pd.DataFrame):
        if isinstance(X, pd.Series):
            X = pd.DataFrame(X, copy=True)
        elif isinstance(X, list):
            # lists are always copied, but for consistency, we still pass the argument
            X = pd.DataFrame(X, copy=True)
        elif isinstance(X, (np.generic, np.ndarray)):
            if X.ndim != 2:
                raise_PasoError(
                    "{} X (1st arg): wrong dimension. must be 2: was {} dim ".
                    format(_fun_name, str(X.ndim)))
            X = pd.DataFrame(X, copy=True)
        elif isinstance(X, csr_matrix):
            X = pd.DataFrame(X.todense(), copy=True)
        else:
            raise_PasoError("{} Unexpected input type: %s".format(
                _fun_name, str(type(X))))

        # copy so the (mutable) default argument is never mutated across calls
        new_col_names = list(columns)
        for i in range(len(columns), X.shape[1]):
            new_col_names.append("c_" + str(i))

        X.columns = new_col_names

    if verbose:
        logger.info("{}  with \ncolumn names: {}".format(_fun_name, X.columns))

    return X
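A minimal sketch: a 2-D numpy array with one supplied column name; the missing name is generated as c_1:

    import numpy as np

    df = toDataFrame(np.arange(6).reshape(3, 2), columns=["a"], verbose=False)
    list(df.columns)   # ["a", "c_1"]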
Example #14
def toDatetimeComponents(
    oX: pd.DataFrame,
    drop: bool = True,
    components: list = [],
    prefix: bool = True,
    inplace: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Parameters:
        oX: dataset

    Keywords:
        drop: Default: True
            If True then the datetime feature/column will be removed.

        components: Default: [] which results in all components
            list of column(feature) names for which datetime components
            are created.

            One or more of: ['Year', 'Month', 'Week', 'Day', 'Dayofweek'
            , 'Dayofyear', 'Elapsed', 'Is_month_end'
            , 'Is_month_start', 'Is_quarter_end'
            , 'Is_quarter_start', 'Is_year_end', 'Is_year_start']

        prefix: Default: True
            If True then the feature name will be the prefix of the created datetime
            component features. The postfix will be _<component>, giving the new
            feature column <feature>_<component>.

            If False, only the first _PREFIX_LENGTH_ characters of the feature string
            will be used to create the new feature name/column
            <feature[0:_PREFIX_LENGTH_]>_<component>.

        verbose: Default True
            True: output
            False: silent

        inplace: Default: True
            True: replace the 1st argument (oX) with the resulting dataframe
            False: return a new dataframe; oX is unchanged

    Returns: pd.DataFrame  transformed into datetime feature components

    Raises:
        1. PasoError: if X is not a DataFrame.
        2. ValueError: if any feature has NA values.

    Note:
        Successful coercion to ``datetime`` costs approximately 100x more than if
        X[[dt_features]] was already of type datetime.

        Coercion from ``object`` to ``datetime`` is attempted per column; columns
        that fail to coerce are skipped. Columns of neither ``datetime`` nor
        ``object`` type are likewise skipped, with no coercion attempted.

        It is best if the raw data field is read in as ``datetime`` rather than
        ``object``. Alternatively, convert the dataframe column beforehand using
        ``pd.to_datetime``.

        Assumes **paso** data cleaning steps (such as removal of Null and NA values)
        have already been applied.
    """
    _fun_name = toDatetimeComponents.__name__

    # todo put in decorator
    if inplace:
        X = oX
    else:
        X = oX.copy()

    if components == []:
        components = list(DatetimetoComponents.COMPONENT_DICT.keys())
    if not isDataFrame(X):
        raise_PasoError("{} not passed DataFrame".format(_fun_name))

    for feature in X.columns:
        _Check_No_NA_F_Values(X, feature)
        try:
            # object/str columns are coerced to datetime, if possible
            if X[feature].dtype == object:
                X[feature] = pd.to_datetime(X[feature])
            # set the new component feature name prefix
            if prefix:
                fn = feature + "_"
            else:
                fn = feature[0:DatetimetoComponents._PREFIX_LENGTH_] + "_"

            for component in components:
                if component.lower() == "elapsed":
                    # ns since the epoch, converted to seconds
                    X[fn + "Elapsed"] = (X[feature].astype(np.int64) //
                                         10**9).astype(np.int32)
                else:
                    X[fn + component] = getattr(X[feature].dt, component.lower())

                if verbose:
                    logger.info("datetime feature component added: {}".format(
                        fn + component))
            if drop:
                X.drop(feature, axis=1, inplace=True)
                if verbose:
                    logger.info("datetime feature dropped: {}".format(feature))
        except (AttributeError, TypeError, ValueError):
            # not a datetime column and not coercible to one; skip it
            pass

    return X
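A sketch on a tiny frame; the component names come from DatetimetoComponents.COMPONENT_DICT, so the subset below is assumed to be present:

    import pandas as pd

    df = pd.DataFrame({"sold": pd.to_datetime(["2020-01-31", "2020-02-01"])})
    out = toDatetimeComponents(df, components=["Year", "Month", "Is_month_end"],
                               verbose=False)
    # out columns: sold_Year, sold_Month, sold_Is_month_end ("sold" was dropped)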
Example #15
def _formats_supported(path):
    # `fmt` avoids shadowing the builtin `format`
    for fmt in Inputers._formats_.keys():
        if path.endswith(fmt):
            return fmt
    raise_PasoError("format of this file is not supported: {}".format(path))