Example #1
def __init__(self, cols=None):
    # Constructor of a wrapper class (Example #4 below shows the full class).
    # sigma=0 adds no training noise; unknown and missing categories map to
    # the target mean ("value").
    self.enc = LeaveOneOutEncoder(
        cols=cols,
        verbose=1,
        drop_invariant=False,
        return_df=True,
        handle_unknown="value",
        handle_missing="value",
        random_state=1,
        sigma=0,
    )
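This example (and #4 below) pins sigma=0, so the encoding is deterministic. To make concrete what leave-one-out encoding computes, here is a toy sketch (not from the source): each training row receives the mean target of its category with that row's own target excluded, i.e. (sum(y_cat) - y_i) / (n_cat - 1).

import pandas as pd
from category_encoders import LeaveOneOutEncoder

X = pd.DataFrame({"color": ["red", "red", "red", "blue", "blue"]})
y = pd.Series([1, 0, 1, 0, 1])

enc = LeaveOneOutEncoder(cols=["color"], sigma=0)  # deterministic, no noise
print(enc.fit_transform(X, y))
# Row 0 ("red", y=1) -> (0 + 1) / 2 = 0.5; row 3 ("blue", y=0) -> 1 / 1 = 1.0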
Example #2
    def loo_encode(self, data):
        """
        Vectorizes multiple categorical variables and stores the fitted
        transformation rules in self.model. Returns the encoded dataset.

        :param data: the dataset used for training (accepts the `data` attribute of a Dataset object)
        """
        org_order = data.columns
        print(self.columns)  # columns that will be encoded
        oe = LeaveOneOutEncoder(cols=self.columns, handle_unknown="value")
        oe_data = oe.fit_transform(data, data[self.target_colname])
        self.model = oe
        oe_data = oe_data.loc[:, org_order]  # restore the original column order
        return oe_data
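handle_unknown controls what categories unseen during fit map to at transform time; with "value" they fall back to the global target mean. A toy sketch (not from the original class):

import pandas as pd
from category_encoders import LeaveOneOutEncoder

X_train = pd.DataFrame({"city": ["a", "a", "b", "b"]})
y_train = pd.Series([1, 0, 0, 0])

enc = LeaveOneOutEncoder(cols=["city"], handle_unknown="value")
enc.fit(X_train, y_train)

X_new = pd.DataFrame({"city": ["a", "c"]})  # "c" was never seen during fit
print(enc.transform(X_new))
# "a" -> its full category mean 0.5; unknown "c" -> global target mean 0.25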
Example #3
import os

import numpy as np
from category_encoders import LeaveOneOutEncoder

# read_cd parses the pool.cd column-description file (likely catboost.utils.read_cd).
def process_classification_dataset(name):
    # Convert categorical features to numeric with leave-one-out encoding.

    data_dir = os.path.join('datasets', name)
    train_file = os.path.join(data_dir, 'full_train')
    test_file = os.path.join(data_dir, 'test')
    cd_file = os.path.join(data_dir, 'pool.cd')

    train = np.loadtxt(train_file, delimiter="\t", dtype="object")
    test = np.loadtxt(test_file, delimiter="\t", dtype="object")
    cd = read_cd(cd_file, data_file=train_file)

    # Target can be called 'Label' or 'Target' in pool.cd
    try:
        label_ind = cd['column_type_to_indices']['Label']
    except KeyError:
        label_ind = cd['column_type_to_indices']['Target']

    np.random.seed(42)  # fix random seed
    train = np.random.permutation(train)

    y_train = train[:, label_ind]
    y_train = y_train.reshape(-1)

    y_test = test[:, label_ind]
    y_test = y_test.reshape(-1)

    cat_features = cd['column_type_to_indices']['Categ']  # features to be replaced

    enc = LeaveOneOutEncoder(cols=cat_features,
                             return_df=False,
                             random_state=10,
                             sigma=0.3)

    transformed_train = enc.fit_transform(train, y_train).astype("float64")
    X_train = np.delete(transformed_train, label_ind, 1)  # remove target column

    transformed_test = enc.transform(test).astype("float64")
    X_test = np.delete(transformed_test, label_ind, 1)  # remove target column

    return np.nan_to_num(X_train), y_train, np.nan_to_num(X_test), y_test, enc
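Note the call order above: fit_transform on the training pool applies per-row leave-one-out statistics plus the sigma=0.3 Gaussian noise, while transform on the test pool uses plain category means with no noise. A toy sketch of that asymmetry (not from the source):

import pandas as pd
from category_encoders import LeaveOneOutEncoder

X = pd.DataFrame({"cat": ["a", "b"] * 20})
y = pd.Series([0, 1] * 20)

enc = LeaveOneOutEncoder(cols=["cat"], sigma=0.3, random_state=10)
train_enc = enc.fit_transform(X, y)  # noisy: per-row LOO means, jittered
test_enc = enc.transform(X)          # deterministic full-category means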
Example #4
import json
import warnings

import pandas as pd
from category_encoders import LeaveOneOutEncoder


class LooEncoder(object):
    def __init__(self, cols=None):
        self.enc = LeaveOneOutEncoder(
            cols=cols,
            verbose=1,
            drop_invariant=False,
            return_df=True,
            handle_unknown="value",
            handle_missing="value",
            random_state=1,
            sigma=0,
        )

    def fit(self, X, y):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.enc.fit(X, y)

    def transform(self, X):
        return self.enc.transform(X)

    def to_json(self):
        # Serialize the fitted state, including the encoder's private
        # attributes (_dim, _mean) and the per-column mapping frames.
        data_json = {
            "cols": self.enc.cols,
            "dim": self.enc._dim,
            "mean": float(self.enc._mean),
            "feature_names": self.enc.feature_names,
            "mapping": {},
        }
        for k, v in self.enc.mapping.items():
            data_json["mapping"][k] = v.to_json()  # DataFrame -> JSON string
        return data_json

    def from_json(self, data_json):
        # Restore the state produced by to_json() onto a fresh encoder.
        self.enc.cols = data_json.get("cols")
        self.enc._dim = data_json.get("dim")
        self.enc._mean = data_json.get("mean")
        self.enc.feature_names = data_json.get("feature_names")
        self.enc.mapping = {}
        for k, v in data_json.get("mapping", {}).items():
            self.enc.mapping[k] = pd.DataFrame(json.loads(v))
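A round-trip sketch for the class above (toy data with string category labels, which survive JSON serialization; assumes the imports shown):

import pandas as pd

X = pd.DataFrame({"cat": ["a", "a", "b", "b"]})
y = pd.Series([1, 0, 0, 1])

enc = LooEncoder(cols=["cat"])
enc.fit(X, y)
state = enc.to_json()        # plain dict of JSON-safe values

restored = LooEncoder(cols=["cat"])
restored.from_json(state)    # rebuild the fitted mapping without refitting
print(restored.transform(X).equals(enc.transform(X)))  # should print True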
Example #5
    def leaveone_encoder(self, df, configger):
        """

        :param df: the train dataset.
        :param configger: the JSON string of config settings; the params are:
            verbose: int
                integer indicating verbosity of the output. 0 for none.
            cols: list
                a list of columns to encode, if None, all string columns will be encoded.
            drop_invariant: bool
                boolean for whether or not to drop columns with 0 variance.
            return_df: bool
                boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
            handle_missing: str
                options are 'error', 'return_nan'  and 'value', defaults to 'value', which returns the target mean.
            handle_unknown: str
                options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
            sigma: float
                adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing
                data are untouched). Sigma gives the standard deviation (spread or "width") of the normal distribution.
                The optimal value is commonly between 0.05 and 0.6. The default is to not add noise, but that leads
                to significantly suboptimal results.
        :return: the transform result
        """
        X, y, encode_col = self.get_Xy(df, configger)

        drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
        handle_missing = set_default_vale("handle_missing", configger, "value")
        handle_unknown = set_default_vale("handle_unknown", configger, "value")
        random_state = set_default_vale("random_state", configger, None)
        sigma = set_default_vale("sigma", configger, None)

        encoder = LeaveOneOutEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant, return_df=True,
                                     handle_unknown=handle_unknown, handle_missing=handle_missing,
                                     random_state=random_state, sigma=sigma)

        res = encoder.fit_transform(X, y)

        return res
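get_Xy and set_default_vale are helpers from the surrounding codebase, so the exact configger schema is not shown here; judging from the keys read above, a hypothetical payload would look roughly like:

{
    "drop_invariant": false,
    "handle_missing": "value",
    "handle_unknown": "value",
    "random_state": 42,
    "sigma": 0.05
}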
Example #6
import json

import optuna
import pandas as pd
from category_encoders import LeaveOneOutEncoder
from optuna.distributions import IntUniformDistribution, LogUniformDistribution
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# dump_optuna_results is a project-specific helper that summarizes the study.
def study_loo(X, X_kaggle, data1, mean_enc_column_names, test_data, y):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=2834)
    X_train = pd.DataFrame(X_train, columns=data1.columns[2:])
    X_test = pd.DataFrame(X_test, columns=data1.columns[2:])
    loo = LeaveOneOutEncoder(cols=mean_enc_column_names, random_state=2834)
    rf = RandomForestRegressor(n_estimators=400, random_state=2834, n_jobs=-1)
    pipe = Pipeline(steps=[('loo', loo), ('rf', rf)])
    param_distribution = {
        'loo__sigma': LogUniformDistribution(1E-5, 1E-1),
        'rf__max_depth': IntUniformDistribution(2, 40),
        'rf__max_features': IntUniformDistribution(1, X_test.shape[1]),
        'rf__min_samples_leaf': IntUniformDistribution(1, 15)
    }
    search = optuna.integration.OptunaSearchCV(pipe,
                                               param_distribution,
                                               cv=5,
                                               n_jobs=-1,
                                               random_state=514,
                                               n_trials=40,
                                               timeout=None,
                                               scoring='r2')
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    test_predict = search.best_estimator_.predict(X_test)
    test_score = r2_score(y_test, test_predict)
    print('Test R^2: ', test_score)
    preds_kaggle = search.best_estimator_.predict(
        pd.DataFrame(X_kaggle, columns=data1.columns[2:]))
    preds_kaggle_df = pd.DataFrame({
        'ID': test_data.ID,
        'y': preds_kaggle,
    })
    preds_kaggle_df.to_csv('loo_submission.csv', index=False)
    # Saving CV results:
    with open("studies/regression_loo_cv.json", 'w') as json_file:
        json.dump(dump_optuna_results(
            search, test_score,
            search.best_estimator_.named_steps['rf'].feature_importances_),
                  json_file,
                  indent=4)
    return X_test, test_score
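Putting the encoder inside the Pipeline is what keeps the search honest: OptunaSearchCV refits the whole pipeline on each CV fold, so the leave-one-out statistics are computed only from that fold's training part and the target never leaks into the validation split. The same holds for plain cross-validation, as in this short sketch (reusing pipe, X_train, and y_train from above):

from sklearn.model_selection import cross_val_score

# Each fold refits the full pipeline (encoder + forest) on its training part only.
scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2')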
Example #7
from category_encoders import (BackwardDifferenceEncoder, CatBoostEncoder,
                               HelmertEncoder, JamesSteinEncoder,
                               LeaveOneOutEncoder, MEstimateEncoder,
                               OneHotEncoder, OrdinalEncoder, SumEncoder,
                               TargetEncoder, WOEEncoder)

# FrequencyEncoder is a project-specific class, not part of category_encoders.
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get an encoder by its name.
    :param encoder_name: Name of the desired encoder
    :param cat_cols: Categorical columns to encode
    :return: Categorical encoder
    """
    encoder = None  # stays None if no name below matches

    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)

    if encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)

    if encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)

    if encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)

    if encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)

    if encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)

    if encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)

    if encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)

    if encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)

    if encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)

    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
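Since every branch just instantiates an encoder class with the same cols argument, a dict lookup is a more compact alternative (a sketch, not the original author's code; FrequencyEncoder is omitted because it is not part of category_encoders):

from category_encoders import (BackwardDifferenceEncoder, CatBoostEncoder,
                               HelmertEncoder, JamesSteinEncoder,
                               LeaveOneOutEncoder, MEstimateEncoder,
                               OneHotEncoder, OrdinalEncoder, SumEncoder,
                               TargetEncoder, WOEEncoder)

_ENCODERS = {cls.__name__: cls for cls in (
    WOEEncoder, TargetEncoder, SumEncoder, MEstimateEncoder,
    LeaveOneOutEncoder, HelmertEncoder, BackwardDifferenceEncoder,
    JamesSteinEncoder, OrdinalEncoder, CatBoostEncoder, OneHotEncoder,
)}

def get_single_encoder(encoder_name: str, cat_cols: list):
    try:
        return _ENCODERS[encoder_name](cols=cat_cols)
    except KeyError:
        raise NotImplementedError(f"Unknown encoder: {encoder_name}")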
Example #8
# Imports as in Examples #6 and #7, with RandomForestClassifier and
# accuracy_score in place of the regressor and r2_score.
def get_single_encoder(encoder_name: str, cat_cols: list):
    encoder = None  # stays None if no name below matches

    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)

    if encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)

    if encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)

    if encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)

    if encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)

    if encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)

    if encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)

    if encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)

    if encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)

    if encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)

    assert encoder is not None, f"Unknown encoder: {encoder_name}"
    return encoder


def study_loo(X, train_columns, mean_enc_column_names, y):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=2834)
    X_train = pd.DataFrame(X_train, columns=train_columns)
    X_test = pd.DataFrame(X_test, columns=train_columns)
    loo = LeaveOneOutEncoder(cols=mean_enc_column_names, random_state=2834)
    rf = RandomForestClassifier(n_estimators=400, random_state=2834, n_jobs=-1)
    pipe = Pipeline(steps=[('loo', loo), ('rf', rf)])
    param_distribution = {
        'loo__sigma': LogUniformDistribution(1E-5, 1E-1),
        'rf__max_depth': IntUniformDistribution(2, 40),
        'rf__max_features': IntUniformDistribution(1, X_test.shape[1]),
        'rf__min_samples_leaf': IntUniformDistribution(1, 15)
    }
    search = optuna.integration.OptunaSearchCV(pipe,
                                               param_distribution,
                                               cv=5,
                                               n_jobs=-1,
                                               random_state=514,
                                               n_trials=40,
                                               timeout=None,
                                               scoring='accuracy')
    search.fit(X_train, y_train)
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    test_predict = search.best_estimator_.predict(X_test)
    test_score = accuracy_score(y_test, test_predict)
    print('Test accuracy: ', test_score)

    # Saving CV results:
    with open("studies/adult_loo_cv.json", 'w') as json_file:
        json.dump(dump_optuna_results(
            search, test_score,
            search.best_estimator_.named_steps['rf'].feature_importances_),
                  json_file,
                  indent=4)
    return X_test, test_score