def __init__(self, cat_cols=None, drop_original: bool = False, encoder=OrdinalEncoder()):
        """
        Categorical transformer. This is a wrapper for categorical encoders.

        :param cat_cols:
        :param drop_original:
        :param encoder:
        """
        self.cat_cols = cat_cols
        self.drop_original = drop_original
        self.encoder = encoder
        self.default_encoder = OrdinalEncoder()
예제 #2
0
    def fit(self, X_df, y=None):
        def regroup_cat(X, liste):
            if X not in liste:
                return ('other')
            else:
                return (X)

        self.prop_to_keep = [
            'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
        ]
        self.prop_transformer = TargetEncoder()
        self.prop_transformer.fit(
            X_df['property_type'].apply(
                lambda x: regroup_cat(x, self.prop_to_keep)), y)

        self.pol_to_keep = [
            'flexible', 'strict_14_with_grace_period', 'moderate',
            'moderate_new'
        ]
        self.pol_transformer = TargetEncoder()
        self.pol_transformer.fit(
            X_df['cancellation_policy'].apply(
                lambda x: regroup_cat(x, self.pol_to_keep)), y)

        self.room_transformer = OrdinalEncoder()
        self.room_transformer.fit(X_df['room_type'])

        self.city_transformer = OneHotEncoder(handle_unknown='ignore')
        self.city_transformer.fit(pd.DataFrame(X_df['city_origin']))

        # numeric_transformer = Pipeline(steps = [('impute', SimpleImputer(strategy='median'))])

        return self
예제 #3
0
    def fit(self, X, y=None, **kwargs):
        """

        :param X:
        :param y:
        :param kwargs:
        :return:
        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose,
                                              cols=self.cols)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [
                x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5
            ]

        return self
예제 #4
0
    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        ordinal_mapping = self.ordinal_encoder.category_mapping

        mappings_out = []
        for switch in ordinal_mapping:

            values = [x[1] for x in switch.get('mapping')]
            column_mapping = self.fit_helmert_coding(values)
            mappings_out.append({
                'col': switch.get('col'),
                'mapping': column_mapping,
            })

        self.mapping = mappings_out

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            generated_cols = get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [
                x for x in generated_cols if X_temp[x].var() <= 10e-5
            ]

        return self
예제 #5
0
    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # first check the type
        X = util.convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().bool():
                raise ValueError('Columns to be encoded can not contain null')

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            handle_unknown='value',
            handle_missing='value'
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        self.mapping = self.generate_mapping()

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = list(X_temp.columns)

        if self.drop_invariant:
            self.drop_cols = []
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
            try:
                [self.feature_names.remove(x) for x in self.drop_cols]
            except KeyError as e:
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                    "Not found in generated cols.\n{}".format(e))

        return self
예제 #6
0
def create_regression_pipeline(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    numerical_indexes = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    non_numerical_indexes = np.array([], int)
    one_hot_indexes_after_handle_missing_values = np.array([], int)
    ordinal_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"),
                     numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle categorical features",
            ColumnTransformer(
                [
                    (
                        "feature_encoder_ordinal",
                        OrdinalEncoder(),
                        ordinal_indexes_after_handle_missing_values,
                    ),
                    (
                        "feature_encoder_onehot",
                        OneHotEncoder(),
                        one_hot_indexes_after_handle_missing_values,
                    ),
                ],
                remainder="passthrough",
            ),
        ),
        ("estimator", LinearRegression(fit_intercept=True)),
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    return {
        'features_train': X_train,
        'features_test': X_test,
        'target_train': y_train,
        'target_test': y_test,
        'target_predicted': y_pred,
        'regression_pipeline': pipeline
    }
예제 #7
0
    def fit(self, X, y=None, **kwargs):
        """Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds
        generally invariant columns to drop consistently.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        ordinal_mapping = self.ordinal_encoder.category_mapping

        mappings_out = []
        for switch in ordinal_mapping:
            values = [x[1] for x in switch.get('mapping')]
            column_mapping = self.fit_backward_difference_coding(values)
            mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, })

        self.mapping = mappings_out

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            generated_cols = get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

        return self
예제 #8
0
    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        # first check the type
        X = util.convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        for col in self.cols:
            self.digits_per_col[col] = self.calc_required_digits(X, col)

        # do a transform on the training data to get a column list
        X_t = self.transform(X, override_return_df=True)
        self._encoded_columns = X_t.columns.values

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [
                x for x in generated_cols if X_temp[x].var() <= 10e-5
            ]

        return self
예제 #9
0
    def __init__(self, verbose=0, cols=None, drop_invariant=False):
        """

        :param verbose:
        :param cols:
        :return:
        """

        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)
예제 #10
0
	def ordinal_encode(self,data):
		"""
		複数のカテゴリ変数をベクトル化して、それぞれ変換規則を保存する関数です。
		ベクトル化したデータセットを返します。
		変換規則はenc_dictに保存されています。

		:param data: 学習で用いるデータセット(Dataset型の属性dataを受け取る)
		"""
		org_order=data.columns
		#self.enc_dict={}
		self.model=OrdinalEncoder(cols=self.columns,handle_unknown="inpute")
		oe_data=self.model.fit_transform(data)
		oe_data=oe_data.ix[:,org_order]
		return oe_data
예제 #11
0
    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # first check the type
        X = util.convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        self.mapping = self.generate_mapping()

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [
                x for x in generated_cols if X_temp[x].var() <= 10e-5
            ]

        return self
예제 #12
0
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name
    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    """
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)

    if encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)

    if encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)

    if encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)

    if encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)

    if encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)

    if encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)

    if encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)

    if encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    if encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
예제 #13
0
    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [
                x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5
            ]

        return self
예제 #14
0
    def fit(self, X, y=None, **kwargs):
        self._dim = X.shape[1]

        if self.cols is None:
            self.cols = get_obj_cols(X)

        self.ordinal_encoder = OrdinalEncoder(cols=self.cols,
                                              handle_unknown='value',
                                              handle_missing='value')

        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        self.mapping = self.generate_mapping()

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = list(X_temp.columns)

        return self
예제 #15
0
def create_clustering_pipeline(X):

    numerical_indexes = np.array([0, 1, 2, 3])
    non_numerical_indexes = np.array([], int)
    non_numerical_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"),
                     numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle_categorical_features",
            ColumnTransformer(
                [(
                    "feature_encoder",
                    OrdinalEncoder(),
                    non_numerical_indexes_after_handle_missing_values,
                )],
                remainder="passthrough",
            ),
        ),
        (
            "estimator",
            KMeans(n_clusters=3, n_init=10, max_iter=300),
        ),
    ])

    _ = pipeline.fit_transform(X)

    clusters = pipeline.named_steps.estimator.labels_

    return {'clusters': clusters, 'clustering_pipeline': pipeline}
예제 #16
0
def cross_validate_forest(X_train, y_train, pipes, grids, kfolds, random_search=False):
    """Cross-validate RandomForestClassifier pipeline."""
    pipes['forest'] = make_pipeline(CategoricalToString(),
                                    SimpleDataFrameImputer(median_cols=['Age', 'Fare'],
                                                           mode_cols=['Embarked']),
                                    OrdinalEncoder(cols=['Title', 'Deck', 'Embarked'],
                                                   handle_unknown='impute'),
                                    RandomForestClassifier(**{'bootstrap': True,
                                                              'max_depth': 70,
                                                              'max_features': 'auto',
                                                              'min_samples_leaf': 4,
                                                              'min_samples_split': 10,
                                                              'n_estimators': 64,
                                                              'random_state': RANDOM_SEED}))
    if random_search:
        n_estimators = [int(x) for x in np.linspace(start=10, stop=500, num=10)]
        max_features = ['auto', 'sqrt']
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [1, 2, 4]
        bootstrap = [True, False]

        random_grid = {'randomforestclassifier__n_estimators': n_estimators,
                       'randomforestclassifier__max_features': max_features,
                       'randomforestclassifier__max_depth': max_depth,
                       'randomforestclassifier__min_samples_split': min_samples_split,
                       'randomforestclassifier__min_samples_leaf': min_samples_leaf,
                       'randomforestclassifier__bootstrap': bootstrap}
        pprint.pprint(random_grid)
        randsearch = RandomizedSearchCV(pipes['forest'], random_grid, n_iter=50, cv=3,
                                        verbose=0, random_state=42)
        start = time.time()
        randsearch.fit(X_train, y_train)
        finish = time.time()
        print('randsearch.fit execution time:', finish - start)
        pprint.pprint(randsearch.best_params_)

    forest = ExtendedClassifier.cross_validate(pipes['forest'], X_train, y_train,
                                               sklearn_cvs_kws={'cv': kfolds},
                                               param_strategy='init',
                                               logdir_path=r'logs/models/forest',
                                               serialize_to=r'models/forest.pickle')
    return forest
예제 #17
0
def get_single_encoder(encoder_name: str, cat_cols: list):
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)

    if encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)

    if encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)

    if encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)

    if encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)

    if encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)

    if encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)

    if encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)

    if encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)

    # assert encoder is not None
    return encoder
예제 #18
0
    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # first check the type
        X = convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose,
                                              cols=self.cols)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [
                x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5
            ]

        return self
예제 #19
0
    def fit(self, X, y=None, **kwargs):
        """

        :param X:
        :param y:
        :param kwargs:
        :return:
        """

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)

        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self
예제 #20
0
    def ordinal_encoder(self, df, configger):
        """

        :param df: the train dataset.
        :param configger: the json str of configger setting, the params means:
            verbose: int
                integer indicating verbosity of the output. 0 for none.
            cols: list
                a list of columns to encode, if None, all string columns will be encoded.
            drop_invariant: bool
                boolean for whether or not to drop columns with 0 variance.
            return_df: bool
                boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
            mapping: list of dict
                a mapping of class to label to use for the encoding, optional.
                the dict contains the keys 'col' and 'mapping'.
                the value of 'col' should be the feature name.
                the value of 'mapping' should be a dictionary of 'original_label' to 'encoded_label'.
                example mapping: [{'col': 'col1', 'mapping': {None: 0, 'a': 1, 'b': 2}}]
            handle_unknown: str
                options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the category -1.
            handle_missing: str
                options are 'error', 'return_nan', and 'value, default to 'value', which treat nan as a category at fit time,
                or -2 at transform time if nan is not a category during fit.
        :return: the transform result
        """
        X, y, encode_col = self.get_Xy(df, configger)

        drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
        handle_missing = set_default_vale("handle_missing", configger, "value")
        handle_unknown = set_default_vale("handle_unknown", configger, "value")

        encoder = OrdinalEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant, return_df=True,
                                 handle_unknown=handle_unknown, handle_missing=handle_missing)

        res = encoder.fit_transform(X, y)

        return res
예제 #21
0
def fetch_TICTACTOE(path, valid_size=0.2, test_size=0.2, seed=None):

    path = Path(path)
    data_path = path / 'tic-tac-toe.data'

    if not data_path.exists():
        path.mkdir(parents=True, exist_ok=True)

        download(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data',
            data_path)

    data = pd.read_csv(data_path, names=np.arange(10))
    encoder = OrdinalEncoder(return_df=False)
    data = encoder.fit_transform(data)

    X, Y = (data[:, :-1]).astype(np.float32), (data[:, -1] - 1).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        stratify=Y,
                                                        test_size=test_size,
                                                        random_state=seed)

    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      stratify=y_train,
                                                      test_size=valid_size /
                                                      (1 - test_size),
                                                      random_state=seed)

    return dict(X_train=X_train,
                y_train=y_train,
                X_valid=X_val,
                y_valid=y_val,
                X_test=X_test,
                y_test=y_test)
예제 #22
0
def EncodeCategoricalData(train_df, test_df):
    """encode data with OrdinalEncoder

            Parameters
            ----------
            train_df: dataframe
                training dataframe object to fit and transform
            test_df: dataframe
                test dataframe object to transform

            Returns
            -------
            transformed training and test dataframe
            """

    # column list to ordinal encode
    ordinal_encode_cols = ["country", "province",
                           "region_1", "taster_name", "variety"]

    # create ordinal encode object
    # object assigns -1 to the first-time-seen values of the test set
    ordinal_encoder = OrdinalEncoder(cols=ordinal_encode_cols,
                                     return_df=True,
                                     handle_unknown="value",
                                     handle_missing="return_nan")

    # fit object on the train dataset
    ordinal_encoder.fit(train_df)

    # transform train and test datasets
    ord_encoded_train = (ordinal_encoder
                         .transform(train_df))

    ord_encoded_test = (ordinal_encoder
                        .transform(test_df))

    return ord_encoded_train, ord_encoded_test
예제 #23
0
    test_data = fg.generate()
    test_data.columns = [str(col) for col in test_data.columns]
    test_data = test_data.fillna(0)

    test_data = add_datepart(test_data, 'timestamp', drop=False)

    prediction = cat_model.predict(test_data) + lgb_model.predict(test_data) + xgb_model.predict(test_data)
    return prediction

if __name__ == "__main__":
    path = 'C:/Work/gitsrc/Kaggle/data-science-bowl-2019'
    test = pd.read_csv(f'{path}/test.csv')

    preprocessedData = pd.read_csv("intermediate.csv")

    title_oe = OrdinalEncoder()
    title_oe.fit(list(set(preprocessedData['session_title'].unique()).union(set(test['title'].unique()))))
    world_oe = OrdinalEncoder()
    world_oe.fit(list(set(preprocessedData['world'].unique()).union(set(test['world'].unique()))))

    preprocessedData['session_title'] = title_oe.transform(preprocessedData['session_title'].values)
    preprocessedData['world'] = world_oe.transform(preprocessedData['world'].values)

    lgb_model, xgb_model, cat_model = train(preprocessedData)

    test['title'] = title_oe.transform(test['title'].values)
    test['world'] = world_oe.transform(test['world'].values)

    prediction = predict(test, lgb_model, xgb_model, cat_model)

    sample_submission = pd.read_csv(f'{path}/sample_submission.csv')
    def fit(self, X, y=None, **kwargs):
        """Fits an ordinal encoder to produce a consistent mapping across applications and optionally finds
        generally invariant columns to drop consistently.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        # first check the type
        X = util.convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        ordinal_mapping = self.ordinal_encoder.category_mapping

        mappings_out = []
        for switch in ordinal_mapping:
            values = switch.get('mapping').get_values()
            column_mapping = self.fit_backward_difference_coding(values)
            mappings_out.append({
                'col': switch.get('col'),
                'mapping': column_mapping,
            })

        self.mapping = mappings_out

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = X_temp.columns.tolist()

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [
                x for x in generated_cols if X_temp[x].var() <= 10e-5
            ]
            try:
                [self.feature_names.remove(x) for x in self.drop_cols]
            except KeyError as e:
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))

        return self
예제 #25
0
    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and binary y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Binary target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # Unite parameters into pandas types
        X = util.convert_input(X)
        y = util.convert_input_vector(y, X.index).astype(float)

        # The lengths must be equal
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) +
                             " but length of y is " + str(y.shape[0]) + ".")

        self._dim = X.shape[1]

        # If columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().any():
                raise ValueError('Columns to be encoded can not contain null')

        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose,
                                              cols=self.cols,
                                              handle_unknown='value',
                                              handle_missing='value')
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        X_ordinal = self.ordinal_encoder.transform(X)

        # Training
        self.mapping = self._train(X_ordinal, y)

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = X_temp.columns.tolist()

        # Store column names with approximately constant variance on the training data
        if self.drop_invariant:
            self.drop_cols = []
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [
                x for x in generated_cols if X_temp[x].var() <= 10e-5
            ]
            try:
                [self.feature_names.remove(x) for x in self.drop_cols]
            except KeyError as e:
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))
        return self
예제 #26
0
    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # if the input dataset isn't already a dataframe, convert it to one (using default column names)
        # first check the type
        X = util.convert_input(X)

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().bool():
                raise ValueError('Columns to be encoded can not contain null')

        # train an ordinal pre-encoder
        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose,
                                              cols=self.cols,
                                              handle_unknown='value',
                                              handle_missing='value')
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        ordinal_mapping = self.ordinal_encoder.category_mapping

        mappings_out = []
        for switch in ordinal_mapping:
            values = switch.get('mapping')
            col = switch.get('col')
            column_mapping = self.fit_sum_coding(col, values,
                                                 self.handle_missing,
                                                 self.handle_unknown)
            mappings_out.append({
                'col': switch.get('col'),
                'mapping': column_mapping,
            })

        self.mapping = mappings_out

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = X_temp.columns.tolist()

        # drop all output columns with 0 variance.
        if self.drop_invariant:
            self.drop_cols = []
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [
                x for x in generated_cols if X_temp[x].var() <= 10e-5
            ]
            try:
                [self.feature_names.remove(x) for x in self.drop_cols]
            except KeyError as e:
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))

        return self
예제 #27
0
    def fit(self, X_df, y):

        collectivite = 'collectivite'
        annee = 'anneeBudg'
        obj = 'objet du dossier'
        direction = 'direction'
        nature = 'Nature de la subvention'
        beneficiaire = 'beneficiaire'
        secteur = 'secteur activite'
        drop_cols = ["numDoc", "siret"]

        def colect(X):
            return X.values[:, np.newaxis]

        colectivite_transformer = FunctionTransformer(colect, validate=False)

        def objetdoss(X):
            return X.values[:, np.newaxis]

        obj_transformer = FunctionTransformer(objetdoss, validate=False)

        def direct(X):
            return X.values[:, np.newaxis]

        direction_transformer = FunctionTransformer(direct, validate=False)

        def nature_t(X):
            return X.values[:, np.newaxis]

        nature_transformer = FunctionTransformer(nature_t, validate=False)

        def beneficiaire_t(X):
            return X.values[:, np.newaxis]

        beneficiaire_transformer = FunctionTransformer(beneficiaire_t,
                                                       validate=False)

        def secteur_t(X):
            return X.values[:, np.newaxis]

        secteur_transformer = FunctionTransformer(secteur_t, validate=False)

        def annee_t(X):
            return X.values[:, np.newaxis]

        annee_transformer = FunctionTransformer(annee_t, validate=False)

        preprocessor = ColumnTransformer(transformers=[
            ('col',
             make_pipeline(colectivite_transformer, OrdinalEncoder(),
                           SimpleImputer(strategy='median')), collectivite),
            ('annee',
             make_pipeline(annee_transformer, SimpleImputer(
                 strategy='median')), annee),
            ('dir',
             make_pipeline(direction_transformer, OrdinalEncoder(),
                           SimpleImputer(strategy='median')), direction),
            ('nature',
             make_pipeline(nature_transformer, OrdinalEncoder(),
                           SimpleImputer(strategy='median')), nature),
            ('beneficiaire',
             make_pipeline(beneficiaire_transformer, OrdinalEncoder(),
                           SimpleImputer(strategy='median')), beneficiaire),
            ('sect',
             make_pipeline(secteur_transformer, OrdinalEncoder(),
                           SimpleImputer(strategy='median')), secteur),
            ('obj',
             make_pipeline(obj_transformer, OrdinalEncoder(),
                           SimpleImputer(strategy='median')), obj),
            ('drop cols', 'drop', drop_cols),
        ])

        self.preprocessor = preprocessor
        self.preprocessor.fit(X_df, y)
        return self
예제 #28
0
    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and binary y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Binary target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # Unite parameters into pandas types
        X = util.convert_input(X)
        y = util.convert_input_vector(y, X.index).astype(float)

        # The lengths must be equal
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) +
                             " but length of y is " + str(y.shape[0]) + ".")

        self._dim = X.shape[1]

        # If columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = util.get_obj_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().bool():
                raise ValueError('Columns to be encoded can not contain null')

        self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose,
                                              cols=self.cols,
                                              handle_unknown='value',
                                              handle_missing='value')
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        X_ordinal = self.ordinal_encoder.transform(X)

        # Training
        if self.model == 'independent':
            self.mapping = self._train_independent(X_ordinal, y)
        elif self.model == 'pooled':
            self.mapping = self._train_pooled(X_ordinal, y)
        elif self.model == 'beta':
            self.mapping = self._train_beta(X_ordinal, y)
        elif self.model == 'binary':
            # The label must be binary with values {0,1}
            unique = y.unique()
            if len(unique) != 2:
                raise ValueError(
                    "The target column y must be binary. But the target contains "
                    + str(len(unique)) + " unique value(s).")
            if y.isnull().any():
                raise ValueError(
                    "The target column y must not contain missing values.")
            if np.max(unique) < 1:
                raise ValueError(
                    "The target column y must be binary with values {0, 1}. Value 1 was not found in the target."
                )
            if np.min(unique) > 0:
                raise ValueError(
                    "The target column y must be binary with values {0, 1}. Value 0 was not found in the target."
                )
            # Perform the training
            self.mapping = self._train_log_odds_ratio(X_ordinal, y)
        else:
            raise ValueError("model='" + str(self.model) +
                             "' is not a recognized option")

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = X_temp.columns.tolist()

        # Store column names with approximately constant variance on the training data
        if self.drop_invariant:
            self.drop_cols = []
            generated_cols = util.get_generated_cols(X, X_temp, self.cols)
            self.drop_cols = [
                x for x in generated_cols if X_temp[x].var() <= 10e-5
            ]
            try:
                [self.feature_names.remove(x) for x in self.drop_cols]
            except KeyError as e:
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))
        return self
예제 #29
0
 def __init__(self, cat_cols=None, drop_original: bool = False, encoder=OrdinalEncoder()):
     self.cat_cols = cat_cols
     self.drop_original = drop_original
     self.encoder = encoder
     self.default_encoder = OrdinalEncoder()
 def fit(self, X):
     for column in X.columns:
         if X[column].dtype.name  in ["object", "category"]:
             self.label_encoders[column] = OrdinalEncoder()
             self.label_encoders[column].fit(X[column])
     return self