def one_hot_encode(self, dataframe):
        one_cols = [
            'events', 'zone', 'pitch_type', 'type', 'home_team', 'away_team',
            'pitch_count', 'L1_pitch_type', 'L1_pitch_result', 'L1_pitch_zone',
            '_count', 'count_cat', 'pitch_cat', 'balls', 'strikes', 'inning',
            'outs_when_up', 'batting_order_slot', 'pitch_subtype', 'count_status'
            ]

        # Keep only the candidate columns that are present in the frame
        one_hot_cols = [col for col in one_cols if col in dataframe.columns]

        # Instantiate encoder
        one_hot_encoder = OneHotEncoder(cols=one_hot_cols,
                                        return_df=True,
                                        use_cat_names=True)
        # Encode features
        encoded = one_hot_encoder.fit_transform(dataframe[one_hot_cols],
                                                dataframe['next_pitch'])

        # Join encoded features into df and drop old columns
        dataframe = dataframe.join(encoded).drop(columns=one_hot_cols)
        return dataframe
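A hypothetical call for the method above; `feature_builder` and `pitches` are illustrative names, not part of the original snippet:

# Hypothetical usage; `feature_builder` is an instance of the enclosing class
# and `pitches` a Statcast-style DataFrame containing a 'next_pitch' column.
pitches_encoded = feature_builder.one_hot_encode(pitches)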
Example #2
    def fit(self, X, y):
        """
        Generates and optimizes all legitimate pipelines. The best pipeline can be retrieved from `self.best_estimator_`

        :param X: Training data
        :param y: Corresponding observations
        :return: `self`
        """
        _X, _y = X, y
        if self.cat_cols is not None:
            from category_encoders.one_hot import OneHotEncoder

            enc = OneHotEncoder(
                cols=self.cat_cols, return_df=False, handle_unknown="ignore"
            )
            enc.fit(X)
            _X = enc.transform(X)

        X_, y_ = _X, _y
        self.num_features = len(X_[0])
        for l in range(1, self.length + 1):
            self._cast(l, X_, y_)
        self.best_estimator_ = list(self.get_top(1).items())[0][1][0]
        self.best_estimator_score = list(self.get_top(1).items())[0][1][1]
        return self
Example #3
def encode_genotypes(df):
    """One-hot encode the genotypes

    :param df: A DataFrame of samples with genotypes as columns
    :type df: pandas DataFrame
    :return: pandas DataFrame of one-hot encoded columns for genotypes and OHE instance
    :rtype: pandas DataFrame, OneHotEncoder instance
    """
    ohe = OneHotEncoder(cols=df.columns, handle_missing="return_nan")
    X = ohe.fit_transform(df)
    return pd.DataFrame(X, index=df.index), ohe
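A minimal usage sketch for encode_genotypes; the genotype data and imports below are illustrative assumptions:

# Illustrative usage of encode_genotypes; the data is made up.
import pandas as pd
from category_encoders import OneHotEncoder

genotypes = pd.DataFrame({
    "rs123": ["AA", "AG", "GG", None],
    "rs456": ["CC", "CT", "CC", "TT"],
})
encoded, ohe = encode_genotypes(genotypes)
# handle_missing="return_nan" keeps the missing genotype as NaN in the encoded
# columns; the fitted `ohe` can be reused on new samples via ohe.transform(...).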
Example #4
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder


def transform_dataset(dataset: pd.DataFrame):
    enc = OneHotEncoder()
    cleaned_dataset = dataset.replace({
        'yes': 1,
        'no': 0,
        'success': 1,
        'failure': 0,
        'unknown': np.nan,
        'other': np.nan,
    })
    transformed = enc.fit_transform(cleaned_dataset)
    print(transformed)
    return transformed
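An illustrative input for transform_dataset; the bank-marketing-style columns are an assumption:

# Illustrative call; column names and values are assumptions.
raw = pd.DataFrame({
    "default": ["yes", "no", "unknown"],
    "poutcome": ["success", "failure", "other"],
    "job": ["admin.", "technician", "admin."],
})
encoded = transform_dataset(raw)
# 'default' and 'poutcome' become numeric via the replace() mapping; the
# remaining string column 'job' is one-hot encoded by OneHotEncoder().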
Example #5
 def build_pipeline(self):
     """
     Makes a pipeline based on data_config
     This is because autosklearn does not perform automatic data encoding
     """
     categorical_list = infer_categoricals(self.X)
     preprocessing_steps = []
     if self.data_config.get("text_columns"):
         print("Applying TFIDF to text columns: {data_config.get('text_columns')}")
         preprocessing_steps.append(make_pipeline(
             ColumnSelector(cols=data_config.get("text_columns"), drop_axis=True),
             TfidfVectorizer()
         ))
         categorical_list = [c for c in categorical_list if c not in data_config["text_columns"]]
     if categorical_list:
         print(f"Applying One Hot Encoding to categorical columns: {categorical_list}")
         preprocessing_steps.append(make_pipeline(
             ColumnSelector(cols=categorical_list),
             OneHotEncoder(handle_unknown="impute")
         ))
     if preprocessing_steps:
         preprocessing_steps = make_union(*preprocessing_steps)
         preprocessing_steps = make_pipeline(preprocessing_steps, SimpleImputer())
     else:
         preprocessing_steps = SimpleImputer()
     if self.problem_type == "classification":
         automl = TPOTClassifier(**self.automl_settings)
     else:
         automl = TPOTRegressor(**self.automl_settings)
     automl_pipeline = make_pipeline(
         preprocessing_steps,
         automl
     )
     return automl_pipeline
from category_encoders import OneHotEncoder, TargetEncoder


def onehot_or_targ(X, y, categorical, k):
    '''Encode the categorical variables of X and y, choosing per column
    between one-hot and target encoding based on a threshold value k.

    Parameters:
    -----------
    X: pd.DataFrame
        The dataframe of a given dataset, excluding its target column.
    y: pd.Series
        The target series of a given dataset.
    categorical: list
        Names of the categorical columns.
    k: int
        Threshold on the number of unique values: columns with more than k
        unique values are target-encoded, the rest are one-hot encoded.

    Returns:
    --------
    pd.DataFrame, pd.Series
        The updated pd.DataFrame with encoded categorical features and the
        updated pd.Series with the encoded categorical target.
    '''
    for column in categorical:
        if len(X[column].unique()) > k:
            if X[column].dtype.name == 'category':
                X[column] = X[column].cat.codes
            if y.dtype.name == 'category':
                y = y.cat.codes
            X = TargetEncoder(cols=[column]).fit_transform(X, y)
        else:
            X = OneHotEncoder(cols=[column]).fit_transform(X)
    return X, y
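A small worked example of the threshold behaviour; the data and k are made up:

# Made-up data: 'city' has 5 unique values, 'sex' has 2.
import pandas as pd

X = pd.DataFrame({
    "city": ["NY", "LA", "SF", "NY", "CHI", "BOS"],
    "sex": ["M", "F", "F", "M", "F", "M"],
})
y = pd.Series([1, 0, 1, 1, 0, 0])
X_enc, y_enc = onehot_or_targ(X, y, categorical=["city", "sex"], k=3)
# 'city' (5 > 3 uniques) is target-encoded; 'sex' (2 <= 3) is one-hot encoded.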
Example #7
    def one_hot_encode(self):
        one_hot_cols = [
            'events', 'zone', 'pitch_type', 'type', 'home_team', 'away_team',
            'pitch_count', 'L1_pitch_type', 'L1_pitch_result', 'L1_pitch_zone',
            '_count', 'count_cat', 'pitch_cat', 'balls', 'strikes', 'inning',
            'outs_when_up', 'batting_order_slot'
            ]
        # Instantiate Encoder
        one_hot_encoder = OneHotEncoder(cols=one_hot_cols,
                                        return_df=True,
                                        use_cat_names=True)
        # Encode features
        encoded = one_hot_encoder.fit_transform(self.df[one_hot_cols],
                                                self.df['next_pitch'])

        # Join encoded features into df and drop old columns
        self.df = self.df.join(encoded).drop(columns=one_hot_cols + ['events_0'])
Example #8
def create_regression_pipeline(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    numerical_indexes = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    non_numerical_indexes = np.array([], int)
    one_hot_indexes_after_handle_missing_values = np.array([], int)
    ordinal_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"),
                     numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle categorical features",
            ColumnTransformer(
                [
                    (
                        "feature_encoder_ordinal",
                        OrdinalEncoder(),
                        ordinal_indexes_after_handle_missing_values,
                    ),
                    (
                        "feature_encoder_onehot",
                        OneHotEncoder(),
                        one_hot_indexes_after_handle_missing_values,
                    ),
                ],
                remainder="passthrough",
            ),
        ),
        ("estimator", LinearRegression(fit_intercept=True)),
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    return {
        'features_train': X_train,
        'features_test': X_test,
        'target_train': y_train,
        'target_test': y_test,
        'target_predicted': y_pred,
        'regression_pipeline': pipeline
    }
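A runnable sketch for the pipeline above, using synthetic data with 13 numeric features to match numerical_indexes (the dataset choice is an assumption):

# Synthetic regression data with 13 numeric features.
import numpy as np
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=13, noise=0.1)
results = create_regression_pipeline(X_demo, y_demo)
print(results["regression_pipeline"].score(results["features_test"],
                                           results["target_test"]))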
    def _perform_categ_fit(self, df, y):
        # https://github.com/scikit-learn-contrib/categorical-encoding
        # https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
        # https://en.wikipedia.org/wiki/Feature_hashing#Feature_vectorization_using_the_hashing_trick
        categ_cols = {}
        onehot_cols = []
        for col in self.categorical_vars:
            categs = df[col].astype(
                pd.api.types.CategoricalDtype()).cat.categories
            if self.categ_enc_method == "onehot":
                card = df[col].nunique()
                if card > 10:
                    print("Warning, cardinality of {} = {}".format(col, card))
                onehot_cols.append(col)
            elif self.categ_enc_method == "target":
                if self.tfs_list["y"] is None:
                    raise Exception(
                        "You have to pass your target variable to the fit() "
                        "function for target encoding")
                # Mean/target/likelihood encoding
                target_col_name = self.tfs_list["y"].name
                df_enc = df.copy()
                df_enc[target_col_name] = self.tfs_list["y"]
                cumsum = df_enc.groupby(
                    col)[target_col_name].cumsum() - df_enc[target_col_name]
                cumcnt = df_enc.groupby(col)[target_col_name].cumcount()
                means = cumsum / cumcnt
                means.rename('mean_enc', inplace=True)

                mean_enc = pd.Series(means, index=self.tfs_list["y"]).to_dict()
                global_mean = self.tfs_list["y"].mean()
                categ_cols[col] = {"target": (global_mean, mean_enc)}
            elif self.categ_enc_method == "hashing":
                str_hashs = [col + "=" + str(val) for val in categs]
                hashs = [hash(h) % self.hash_space for h in str_hashs]
                categ_cols[col] = {"hashing": hashs}
        if len(onehot_cols) > 0:
            enc = CategOneHot(cols=onehot_cols, handle_unknown='impute')
            enc.fit(df)
            self.tfs_list["onehot"] = enc
        self.tfs_list["categ_cols"] = categ_cols
    def one_hot_encoder(self, df, configger):
        """

        :param df: the train dataset.
        :param configger: the JSON string of configger settings; the params mean:
            verbose: int
                integer indicating verbosity of the output. 0 for none.
            cols: list
                a list of columns to encode, if None, all string columns will be encoded.
            drop_invariant: bool
                boolean for whether or not to drop columns with 0 variance.
            return_df: bool
                boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
            use_cat_names: bool
                if True, category values will be included in the encoded column names. Since this can result in duplicate column names, duplicates are suffixed with '#' symbol until a unique name is generated.
                If False, category indices will be used instead of the category values.
            handle_unknown: str
                options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
                an extra column will be added in if the transform matrix has unknown categories.  This can cause
                unexpected changes in dimension in some cases.
            handle_missing: str
                options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
                an extra column will be added in if the transform matrix has nan values.  This can cause
                unexpected changes in dimension in some cases.

        :return: the transformed DataFrame
        """
        X, y, encode_col = self.get_Xy(df, configger)

        drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
        handle_missing = set_default_vale("handle_missing", configger, "value")
        handle_unknown = set_default_vale("handle_unknown", configger, "value")
        use_cat_names = set_default_vale("use_cat_names", configger, False, is_bool=True)

        encoder = OneHotEncoder(verbose=1, cols=encode_col, drop_invariant=drop_invariant, return_df=True,
                                use_cat_names=use_cat_names,
                                handle_unknown=handle_unknown, handle_missing=handle_missing)

        res = encoder.fit_transform(X, y)

        return res
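For reference, a standalone sketch of the equivalent direct category_encoders call with the defaults documented above (illustrative data; the class's get_Xy and set_default_vale helpers are bypassed):

# Standalone equivalent of the defaults above; the data is made up.
import pandas as pd
from category_encoders import OneHotEncoder

df = pd.DataFrame({"color": ["red", "blue", "red"], "label": [1, 0, 1]})
encoder = OneHotEncoder(verbose=1, cols=["color"], drop_invariant=False,
                        return_df=True, use_cat_names=False,
                        handle_unknown="value", handle_missing="value")
res = encoder.fit_transform(df[["color"]], df["label"])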
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get an encoder by its name.

    :param encoder_name: Name of the desired encoder
    :param cat_cols: Categorical columns to encode
    :return: Categorical encoder
    """
    encoder = None  # avoids a NameError when an unknown name is passed

    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)

    if encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)

    if encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)

    if encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)

    if encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)

    if encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)

    if encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)

    if encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)

    if encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    if encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
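A usage sketch, assuming the encoder classes referenced above are imported from category_encoders (FrequencyEncoder appears to be a project-local helper):

# Hypothetical lookup; X and y stand for a training frame and its target.
enc = get_single_encoder("CatBoostEncoder", cat_cols=["city", "sex"])
X_enc = enc.fit_transform(X, y)  # most of these encoders use y during fit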
def get_single_encoder(encoder_name: str, cat_cols: list):
    encoder = None  # avoids a NameError for unrecognized names
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)

    if encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)

    if encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)

    if encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)

    if encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)

    if encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)

    if encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)

    if encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)

    if encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)

    if encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)

    if encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)

    # assert encoder is not None
    return encoder
    def read_data(self,
                  data_path,
                  y_col,
                  index_col=None,
                  skip_cols=None,
                  test_size=0.3):
        """
        Read a csv file with data.

        Categorical variables will be encoded according to the one hot
        scheme.

        Parameters
        ----------
        data_path : str
            Path to the csv file with data.
        y_col : str
            Ground truth labels with values of 0 and 1.
        index_col : int, sequence or bool, optional
            Column to use as the row labels of the DataFrame. If a sequence is
            given, MultiIndex is used.
        skip_cols : list
            List of features / columns to exclude from the data.
        test_size : float
            Should be between 0.0 and 1.0 and represent the proportion of the
            dataset to include in the test split.

        Returns
        -------
        data_splits : list, length = 4
            List containing train-test split.

        """
        data = pd.read_csv(data_path,
                           index_col=index_col,
                           na_values=['NONE', 'na'])
        # drop columns where NAs constitute around 10% of all entries
        na_max_percent = 0.1
        nas = data.isna().sum()
        excessive_na_cols = set(nas[nas > na_max_percent * len(data)].index)
        excessive_na_cols = excessive_na_cols.union(set(skip_cols or []))
        data_cols = set(data.columns).difference(excessive_na_cols)

        if y_col not in data_cols:
            raise ValueError(f'Too many entries without the labels {y_col}')

        numeric_cols = set(
            data._get_numeric_data().columns).difference(excessive_na_cols)
        # since y_col contains 0, 1 it should be numeric
        categorical_cols = data_cols - numeric_cols
        numeric_cols.remove(y_col)

        data = data.loc[:, data_cols]
        data = data.dropna()
        X = data.loc[:, numeric_cols.union(categorical_cols)]
        y = data[y_col].values

        # encode categorical variables
        encoder = OneHotEncoder(cols=categorical_cols, use_cat_names=True)
        X = encoder.fit_transform(X)
        data_splits = train_test_split(X,
                                       y,
                                       test_size=test_size,
                                       random_state=self._random_state)

        return data_splits
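A hypothetical call for read_data; `reader` stands for an instance of the enclosing class, and the path and label column are placeholders:

# Hypothetical call; 'loans.csv' and 'is_bad' are placeholder names.
X_train, X_test, y_train, y_test = reader.read_data(
    data_path="loans.csv",
    y_col="is_bad",
    skip_cols=[],
    test_size=0.3,
)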
Example #14
    def eoa_fit(self, X, y, **kwargs):
        """
        Applies evolutionary optimization methods to find an optimum pipeline

        :param X: Training data
        :param y: Corresponding observations
        :param kwargs: `EOA` parameters
        :return: `self`
        """
        from .structsearch import BoxSample, CompactSample
        from .eoa import EOA
        _X, _y = X, y
        if self.cat_cols is not None:
            from category_encoders.one_hot import OneHotEncoder
            enc = OneHotEncoder(cols=self.cat_cols, return_df=False, handle_unknown='ignore')
            enc.fit(X)
            _X = enc.transform(X)
        X_, y_ = _X, _y
        self.num_features = len(X_[0])
        Pop = []
        for l in range(1, self.length + 1):
            candidates = self.words.Generate(l)
            for cnddt in candidates:
                if self._validate_sequence(cnddt):
                    Pop.append(cnddt)

        def _eval(ppl):
            if self.couldBfirst == []:
                from sklearn.pipeline import Pipeline
            else:
                from imblearn.pipeline import Pipeline
            from sklearn.model_selection import RandomizedSearchCV
            if self.surrogates is None:
                from numpy import logspace
                from sklearn.gaussian_process import GaussianProcessRegressor
                from sklearn.kernel_ridge import KernelRidge
                from sklearn.gaussian_process.kernels import Matern, Sum, ExpSineSquared, WhiteKernel
                param_grid_gpr = {"alpha": logspace(-8, 1, 20),
                                  "kernel": [Sum(Matern(length_scale=l_, nu=p), WhiteKernel(noise_level=q))
                                             for l_ in logspace(-3, 3, 20)
                                             for p in [0.5, 1.5, 2.5]
                                             for q in logspace(-3, 1.5, 20)]}
                GPR = RandomizedSearchCV(GaussianProcessRegressor(), param_distributions=param_grid_gpr, n_iter=20,
                                         cv=2)
                param_grid_krr = {"alpha": logspace(-4, 0, 10),
                                  "kernel": [Sum(Matern(), ExpSineSquared(l_, p))
                                             for l_ in logspace(-2, 2, 20)
                                             for p in logspace(0, 2, 20)]}
                KRR = RandomizedSearchCV(KernelRidge(), param_distributions=param_grid_krr, n_iter=30, cv=2)
                self.surrogates = [(KRR, 35, CompactSample, 'L-BFGS-B'), (GPR, 50, BoxSample, 'L-BFGS-B')]
                self.min_random_evals = 10
            from collections import OrderedDict
            fitted = OrderedDict([])
            for seq in ppl:
                best_mdl, best_scr = self.optimize_pipeline(seq, X_, y_)
                if seq not in self.models:
                    self.models[seq] = (best_mdl, best_scr)
                if self.verbose > 0:
                    print("score:%f" % best_scr)
                    print(best_mdl)
                fitted[seq] = -best_scr
            return fitted

        num_parents = kwargs.pop('num_parents', 30)
        mutation_prob = kwargs.pop('mutation_prob', .1)
        _eoa = EOA(population=Pop, fitness=_eval, num_parents=num_parents, mutation_prob=mutation_prob,
                   term_genes=self.couldBlast, init_genes=self.couldBfirst, **kwargs)
        _eoa()
        self.best_estimator_ = list(self.get_top(1).items())[0][1][0]
        return self
Example #15
 X_test = transform_types_X(X_test)
 y_train, y_test = load_obj("y_train"), load_obj("y_test")
 encoder = load_obj("label_encoder")
 print("CHANGING COLUMN NAMES")
 X_train.columns = [
     "".join(c if c.isalnum() else "_" for c in str(x))
     for x in X_train.columns
 ]
 X_test.columns = [
     "".join(c if c.isalnum() else "_" for c in str(x))
     for x in X_test.columns
 ]
 if args.encoder == "CatBoost":
     cat_encoder = CatBoostEncoder()
 elif args.encoder == "OneHot":
     cat_encoder = OneHotEncoder()
 print("HACIENDO CATEGORICAL ENCODER")
 X_train = cat_encoder.fit_transform(X_train, y_train)
 X_test = cat_encoder.transform(X_test)
 print("FITTING STACKING")
 stacking.fit(X_train, y_train)
 save_obj(stacking, f"{args.name}")
 X_test, y_test = RandomUnderSampler(sampling_strategy={
     5: int(0.11 * 13526)
 }).fit_resample(X_test, y_test)
 preds = stacking.predict(X_test)
 save_obj(preds, f"{args.name}_preds")
 print(
     f"F1 SCORE {f1_score(y_test, preds , average='macro')}, F2 SCORE {fbeta_score(y_test, preds, average='macro', beta=2)},F05 SCORE {fbeta_score(y_test, preds, average='macro', beta=0.5)}, PRECISION IS {precision_score(y_test, preds, average='macro')},RECALL IS {recall_score(y_test, preds, average='macro')}, ACCURACY IS {accuracy_score(y_test, preds)}"
 )
 cm = confusion_matrix(y_test, preds, normalize="true")
Example #16
    def fit(self, X: X_TYPES, y: Y_TYPES):
        """Fit to data.

        Parameters
        ----------
        X: dict, list, tuple, np.ndarray or pd.DataFrame
            Feature set with shape=(n_samples, n_features).

        y: int, str or sequence
            - If int: Index of the target column in X.
            - If str: Name of the target column in X.
            - Else: Target column with shape=(n_samples,).

        Returns
        -------
        self: Encoder

        """
        X, y = self._prepare_input(X, y)
        self._cat_cols = X.select_dtypes(exclude="number")

        # Check Parameters
        if self.strategy.lower().endswith("encoder"):
            self.strategy = self.strategy[:-7]  # Remove the Encoder at the end
        if self.strategy.lower() not in ENCODING_STRATS:
            raise ValueError(
                f"Invalid value for the strategy parameter, got {self.strategy}. "
                f"Choose from: {', '.join(ENCODING_STRATS)}."
            )
        strategy = ENCODING_STRATS[self.strategy.lower()]

        if self.max_onehot is None:
            self.max_onehot = 0
        elif self.max_onehot < 0:  # if 0, 1 or 2: it never uses one-hot encoding
            raise ValueError(
                "Invalid value for the max_onehot parameter."
                f"Value should be >= 0, got {self.max_onehot}."
            )
        if self.frac_to_other:
            if self.frac_to_other <= 0 or self.frac_to_other >= 1:
                raise ValueError(
                    "Invalid value for the frac_to_other parameter. Value "
                    f"should be between 0 and 1, got {self.frac_to_other}."
                )

        self.log("Fitting Encoder...", 1)

        for col in X:
            self._to_other[col] = []
            if col in self._cat_cols:
                # Group uncommon classes into "other"
                if self.frac_to_other:
                    for category, count in X[col].value_counts().items():
                        if count < self.frac_to_other * len(X[col]):
                            self._to_other[col].append(category)
                            X[col] = X[col].replace(category, "other")

                # Count number of unique values in the column
                n_unique = len(X[col].unique())

                # Perform encoding type dependent on number of unique values
                if n_unique == 2:
                    self._encoders[col] = OrdinalEncoder(
                        dtype=np.int8,
                        handle_unknown="error",
                    ).fit(X[col].values.reshape(-1, 1))

                elif 2 < n_unique <= self.max_onehot:
                    self._encoders[col] = OneHotEncoder(
                        handle_missing="error",
                        handle_unknown="error",
                        use_cat_names=True,
                    ).fit(pd.DataFrame(X[col]))

                else:
                    self._encoders[col] = strategy(
                        handle_missing="error",
                        handle_unknown="error",
                        **self.kwargs,
                    ).fit(pd.DataFrame(X[col]), y)

        self._is_fitted = True
        return self
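A sketch of the cardinality-based dispatch above; the Encoder constructor arguments and data are assumptions:

# Assumed construction: with max_onehot=3, a 2-class column gets an
# OrdinalEncoder, a 3-class column a OneHotEncoder, and a 4-class column
# falls back to the chosen strategy (here, target encoding).
import pandas as pd

X = pd.DataFrame({
    "binary": ["yes", "no", "yes", "no"],
    "low":    ["a", "b", "c", "a"],
    "high":   ["u1", "u2", "u3", "u4"],
})
encoder = Encoder(strategy="Target", max_onehot=3).fit(X, y=[0, 1, 0, 1])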
Example #17
    # Orphaned cross-validation fragment; X and y were presumably arrays
    # defined by an enclosing KFold loop in the original script.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

train = df.loc[:, df.columns != 'DEMAND']
test = df['DEMAND']

# Scale Data
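# NOTE: each of the four encoder assignments below overrides the previous
# one; only the final OneHotEncoder is actually fitted.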

encoder = SumEncoder(cols=['FABRICATION'])

encoder = SumEncoder(cols=[
    'CHANNEL', 'STYLE', 'COLOR', 'INVENTORY_GROUP', 'GENDER_CATEGORY_DESC',
    'FABRICATION', 'SILHOUETTE'
])

encoder = OneHotEncoder(cols=['FABRICATION'])

encoder = OneHotEncoder(cols=[
    'CHANNEL', 'STYLE', 'COLOR', 'INVENTORY_GROUP', 'GENDER_CATEGORY_DESC',
    'FABRICATION'
])

train = encoder.fit_transform(train)

# The 'intercept' column only exists when a contrast coder such as SumEncoder
# was used; errors='ignore' keeps this drop safe for OneHotEncoder output.
train = train.drop(['intercept'], axis=1, errors='ignore')

test = test.values
test = test.reshape(-1, 1)
scaler = StandardScaler()
scaler.fit(test)
test = scaler.transform(test)
Example #18
    def _generate_features(self,
                           X,
                           y=None,
                           numeric_extra=None,
                           categorical_extra=None):
        try:
            self.feature_pipeline_

        except AttributeError:
            n_days = X['dayofweek'].nunique()
            n_hours = X['hour'].nunique()

            self.feature_pipeline_ = Pipeline([(
                'features',
                FeatureUnion([
                    # time of week part of TOWT
                    ('weeks',
                     Pipeline([
                         ('split',
                          FeatureUnion([
                              ('days',
                               Pipeline([
                                   ('select', ColumnSelector('dayofweek')),
                                   ('ordinal',
                                    OrdinalEncoder(cols=['dayofweek'],
                                                   return_df=False)),
                                   ('unknown',
                                    SimpleImputer(missing_values=-1,
                                                  strategy='most_frequent'))
                               ])),
                              ('hours',
                               Pipeline([('select', ColumnSelector('hour')),
                                         ('ordinal',
                                          OrdinalEncoder(cols=['hour'],
                                                         return_df=False)),
                                         ('unknown',
                                          SimpleImputer(
                                              missing_values=-1,
                                              strategy='most_frequent'))]))
                          ])),
                         ('to_pandas',
                          FunctionTransformer(lambda x: pd.DataFrame(
                              x, columns=['dayofweek', 'hour']))),
                         ('term',
                          PatsyTransformer('-1 + C(dayofweek):C(hour)'))
                     ])) if (n_days > 1) and (n_hours > 1) else
                    ('days',
                     Pipeline([
                         ('select', ColumnSelector('dayofweek')),
                         ('ordinal',
                          OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                         ('unknown',
                          SimpleImputer(missing_values=-1,
                                        strategy='most_frequent')),
                         ('to_pandas',
                          FunctionTransformer(lambda x: pd.DataFrame(
                              x, columns=['dayofweek']))),
                         ('one_hot',
                          OneHotEncoder(cols=['dayofweek'], return_df=False))
                     ])) if n_days > 1 else
                    ('hours',
                     Pipeline(
                         [('select', ColumnSelector('hour')),
                          ('ordinal',
                           OrdinalEncoder(cols=['hour'], return_df=False)),
                          ('unknown',
                           SimpleImputer(missing_values=-1,
                                         strategy='most_frequent')),
                          ('to_pandas',
                           FunctionTransformer(
                               lambda x: pd.DataFrame(x, columns=['hour']))),
                          ('one_hot',
                           OneHotEncoder(cols=['hour'], return_df=False))])),

                    # temperature part of TOWT
                    ('temperature',
                     ColumnTransformer([
                         ('encode_temperature',
                          IntervalEncoder(
                              n_chunks=10,
                              span=0.1 * X[self.temperature_col].std(),
                              method='normal'), [self.temperature_col])
                     ])),
                    ('temperature_interact',
                     'drop' if n_hours == 1 else Pipeline(
                         [('split',
                           FeatureUnion([
                               ('temperature_part',
                                Pipeline([
                                    ('select',
                                     ColumnSelector(self.temperature_col)),
                                    (
                                        'create_bins',
                                        KBinsDiscretizer(
                                            n_bins=self.n_bins_temperature,
                                            strategy='quantile',
                                            encode='ordinal'),
                                    )
                                ])),
                               ('hour_part',
                                Pipeline([('select', ColumnSelector('hour')),
                                          ('ordinal',
                                           OrdinalEncoder(cols=['hour'],
                                                          return_df=False)),
                                          ('unknown',
                                           SimpleImputer(
                                               missing_values=-1,
                                               strategy='most_frequent'))]))
                           ])),
                          ('to_pandas',
                           FunctionTransformer(lambda x: pd.DataFrame(
                               x, columns=[self.temperature_col, 'hour']))),
                          ('term',
                           PatsyTransformer(
                               f'-1 + C({self.temperature_col}):C(hour)'))])),

                    # deal with extra numerical regressors
                    ('numerical_regressors',
                     'drop' if not numeric_extra else ColumnTransformer(
                         [(f'encode_{col}',
                           IntervalEncoder(n_chunks=4,
                                           span=0.1 * X[col].std(),
                                           method='normal'), [col])
                          for col in numeric_extra])),

                    # deal with extra categorical regressors
                    ('categorical_regressors', 'drop' if not categorical_extra
                     else TargetEncoder(cols=categorical_extra,
                                        return_df=False,
                                        handle_missing='value',
                                        handle_unknown='value'))
                ]))])
            # Fit the pipeline
            self.feature_pipeline_.fit(X, y)

        finally:
            return self.feature_pipeline_.transform(X)
for col in df.columns:
    if df[col].dtype == object:
        imp = SimpleImputer(strategy='most_frequent')
        df[col] = imp.fit_transform(df[[col]])
    else:
        imp = SimpleImputer(strategy='mean')
        df[col] = imp.fit_transform(df[[col]])

## Analysing the Data
# my_report = sweetviz.analyze([df,'Train'], target_feat= 'G3')
# my_report.show_html()

## Scaling and Encoding the data
for colum in df.columns:
    if df[colum].dtype == object:
        # print(colum, df[colum].unique().tolist())
        df = OneHotEncoder(cols=[colum], use_cat_names=True).fit_transform(df)

columns = df.columns
df = MinMaxScaler().fit_transform(df)
df = pd.DataFrame(df, columns=columns)

## Finding the Correlations between Features
# sns.heatmap(df.corr(), fmt = '.1f',annot = True)
# plt.show()

correlations = df.corr()['SalePrice'].drop('SalePrice')

# print(correlations)
# print(correlations.quantile(.25))
# print(correlations.quantile(.75))
# print(correlations.quantile(.50))
Example #20
def load():
    train = pd.read_csv("/kaggle/input/google-quest-challenge/train.csv")
    test = pd.read_csv("/kaggle/input/google-quest-challenge/test.csv")

    target_cols = [
        'question_asker_intent_understanding', 'question_body_critical',
        'question_conversational', 'question_expect_short_answer',
        'question_fact_seeking', 'question_has_commonly_accepted_answer',
        'question_interestingness_others', 'question_interestingness_self',
        'question_multi_intent', 'question_not_really_a_question',
        'question_opinion_seeking', 'question_type_choice',
        'question_type_compare', 'question_type_consequence',
        'question_type_definition', 'question_type_entity',
        'question_type_instructions', 'question_type_procedure',
        'question_type_reason_explanation', 'question_type_spelling',
        'question_well_written', 'answer_helpful',
        'answer_level_of_information', 'answer_plausible', 'answer_relevance',
        'answer_satisfaction', 'answer_type_instructions',
        'answer_type_procedure', 'answer_type_reason_explanation',
        'answer_well_written'
    ]

    data_cols = ['question_title', 'question_body', 'answer', 'category']

    y_train = train[target_cols].copy()
    print(type(y_train))
    x_train = train[data_cols].copy()
    del train

    x_test = test.copy()
    del test

    question_body_doc2vec = MyDoc2Vec()
    answer_doc2vec = MyDoc2Vec()

    x_train_question_vec = question_body_doc2vec.fit_transform(
        x_train['question_body'])
    x_test_question_vec = question_body_doc2vec.transform(
        x_test['question_body'])
    x_train_answer_vec = answer_doc2vec.fit_transform(x_train['answer'])
    x_test_answer_vec = answer_doc2vec.transform(x_test['answer'])

    print(x_train_question_vec.shape)

    text_encoder = Pipeline(
        [('Text-TF-IDF', TfidfVectorizer(ngram_range=(1, 1))),
         ('Text-SVD', TruncatedSVD(n_components=100))],
        verbose=True)

    ohe = OneHotEncoder(cols=['category'])

    preprocessor = ColumnTransformer([
        ('Q-T', text_encoder, 'question_title'),
        ('Q-B', text_encoder, 'question_body'),
        ('A', text_encoder, 'answer'),
        ('Category', ohe, 'category'),
    ])

    x_train = preprocessor.fit_transform(x_train).astype(np.float32)
    x_test = preprocessor.transform(x_test).astype(np.float32)
    y_train = y_train.values.astype(np.float32)

    x_train = np.concatenate(
        [x_train, x_train_question_vec, x_train_answer_vec], axis=1)
    x_test = np.concatenate([x_test, x_test_question_vec, x_test_answer_vec],
                            axis=1)

    return x_train, y_train, x_test
Example #21
def create_classification_pipeline(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    numerical_indexes = np.array([0, 1, 2, 3])
    non_numerical_indexes = np.array([], int)
    ordinal_indexes_after_handle_missing_values = np.array([], int)
    one_hot_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"),
                     numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle_categorical_features",
            ColumnTransformer(
                [
                    (
                        "feature_encoder_ordinal",
                        OrdinalEncoder(),
                        ordinal_indexes_after_handle_missing_values,
                    ),
                    (
                        "feature_encoder_onehot",
                        OneHotEncoder(),
                        one_hot_indexes_after_handle_missing_values,
                    ),
                ],
                remainder="passthrough",
            ),
        ),
        (
            "estimator",
            LogisticRegression(
                solver="liblinear",
                penalty="l2",
                C=1.0,
                fit_intercept=True,
                class_weight=None,
                max_iter=100,
                multi_class="auto",
            ),
        ),
    ])

    _ = pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)

    return {
        'features_train': X_train,
        'features_test': X_test,
        'target_train': y_train,
        'target_test': y_test,
        'target_predicted': y_pred,
        'target_probability': y_prob,
        'classification_pipeline': pipeline
    }
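An illustrative run on a four-feature dataset matching numerical_indexes above (the dataset choice is an assumption):

# Iris has exactly the 4 numeric features expected by numerical_indexes.
from sklearn.datasets import load_iris

iris = load_iris()
results = create_classification_pipeline(iris["data"], iris["target"])
print(results["classification_pipeline"].score(results["features_test"],
                                               results["target_test"]))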
from category_encoders import OneHotEncoder as CEOneHotEncoder


def enc(X):
    e = CEOneHotEncoder(use_cat_names=True, handle_unknown='ignore').fit(X)
    return e.transform(X)
Example #23
    text_encoder = Pipeline(
        [('Text-TF-IDF', TfidfVectorizer(ngram_range=(1, 3))),
         ('Text-SVD', TruncatedSVD(n_components=100))],
        verbose=True)

    #Encode 'url'
    # gives part of string (URL) before '.'
    before_dot = re.compile('^[^.]*')

    def transform_url(x):
        return x.apply(lambda v: re.findall(before_dot, urlparse(v).netloc)[0])

    url_encoder = Pipeline(
        [('URL-transformer', FunctionTransformer(transform_url,
                                                 validate=False)),
         ('URL-OHE', OneHotEncoder(drop_invariant=True))],
        verbose=True)
    #Encode 'category'
    ohe = OneHotEncoder(cols='category', drop_invariant=True)
    #Transform
    preprocessor = ColumnTransformer([('Q-T', text_encoder, 'question_title'),
                                      ('Q-B', text_encoder, 'question_body'),
                                      ('A', text_encoder, 'answer'),
                                      ('URL', url_encoder, 'url'),
                                      ('Category', ohe, 'category')],
                                     verbose=True)

    x_train = preprocessor.fit_transform(x_train)
    x_test = preprocessor.transform(x_test)

    print(x_train.shape)