def one_hot_encode(self, dataframe):
    one_cols = [
        'events', 'zone', 'pitch_type', 'type', 'home_team', 'away_team',
        'pitch_count', 'L1_pitch_type', 'L1_pitch_result', 'L1_pitch_zone',
        '_count', 'count_cat', 'pitch_cat', 'balls', 'strikes', 'inning',
        'outs_when_up', 'batting_order_slot', 'pitch_subtype', 'count_status'
    ]
    # Keep only the candidate columns actually present in the frame
    one_hot_cols = [col for col in one_cols if col in dataframe.columns]

    # Instantiate encoder
    one_hot_encoder = OneHotEncoder(cols=one_hot_cols, return_df=True,
                                    use_cat_names=True)

    # Encode features
    encoded = one_hot_encoder.fit_transform(dataframe[one_hot_cols],
                                            dataframe['next_pitch'])

    # Join encoded features into the frame and drop the original columns
    dataframe = dataframe.join(encoded).drop(columns=one_hot_cols)
    return dataframe
def fit(self, X, y):
    """
    Generates and optimizes all legitimate pipelines. The best pipeline
    can be retrieved from `self.best_estimator_`.

    :param X: Training data
    :param y: Corresponding observations
    :return: `self`
    """
    X_, y_ = X, y
    if self.cat_cols is not None:
        from category_encoders.one_hot import OneHotEncoder
        enc = OneHotEncoder(cols=self.cat_cols, return_df=False,
                            handle_unknown="ignore")
        enc.fit(X)
        X_ = enc.transform(X)
    self.num_features = len(X_[0])
    for l in range(1, self.length + 1):
        self._cast(l, X_, y_)
    top = list(self.get_top(1).items())[0][1]
    self.best_estimator_ = top[0]
    self.best_estimator_score = top[1]
    return self
def encode_genotypes(df):
    """One-hot encode the genotypes.

    :param df: A DataFrame of samples with genotypes as columns
    :type df: pandas DataFrame
    :return: pandas DataFrame of one-hot encoded genotype columns and the
        fitted OHE instance
    :rtype: pandas DataFrame, OneHotEncoder instance
    """
    ohe = OneHotEncoder(cols=df.columns, handle_missing="return_nan")
    X = ohe.fit_transform(df)
    return pd.DataFrame(X, index=df.index), ohe
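# Hypothetical usage sketch for encode_genotypes above; the sample data and
# SNP column names are invented for illustration.
import pandas as pd
from category_encoders import OneHotEncoder

genotypes = pd.DataFrame(
    {"rs123": ["AA", "AG", "GG"], "rs456": ["CC", "CT", "CC"]},
    index=["sample1", "sample2", "sample3"],
)
encoded, ohe = encode_genotypes(genotypes)
print(encoded.head())   # one indicator column per genotype value
# the fitted encoder can then be reused on new samples:
# ohe.transform(new_genotypes)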
def transform_dataset(dataset: pd.DataFrame):
    enc = OneHotEncoder()
    cleaned_dataset = dataset.replace({
        'yes': 1,
        'no': 0,
        'success': 1,
        'failure': 0,
        'unknown': np.nan,
        'other': np.nan,
    })
    transformed = enc.fit_transform(cleaned_dataset)
    print(transformed)
    return transformed
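# Hypothetical usage sketch for transform_dataset above; the toy frame is
# invented for illustration. With no `cols` argument, category_encoders'
# OneHotEncoder encodes every string column left after the replacements.
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder

bank = pd.DataFrame({
    "default": ["yes", "no", "no"],                  # replaced with 1/0
    "poutcome": ["success", "failure", "unknown"],   # replaced with 1/0/NaN
    "job": ["admin", "technician", "admin"],         # one-hot encoded
})
print(transform_dataset(bank))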
def build_pipeline(self):
    """
    Makes a pipeline based on data_config.

    This is needed because autosklearn does not perform automatic data
    encoding.
    """
    categorical_list = infer_categoricals(self.X)
    preprocessing_steps = []
    text_columns = self.data_config.get("text_columns")
    if text_columns:
        print(f"Applying TFIDF to text columns: {text_columns}")
        preprocessing_steps.append(make_pipeline(
            ColumnSelector(cols=text_columns, drop_axis=True),
            TfidfVectorizer()
        ))
        categorical_list = [c for c in categorical_list if c not in text_columns]
    if categorical_list:
        print(f"Applying One Hot Encoding to categorical columns: {categorical_list}")
        preprocessing_steps.append(make_pipeline(
            ColumnSelector(cols=categorical_list),
            OneHotEncoder(handle_unknown="impute")
        ))
    if preprocessing_steps:
        preprocessing_steps = make_union(*preprocessing_steps)
        preprocessing_steps = make_pipeline(preprocessing_steps, SimpleImputer())
    else:
        preprocessing_steps = SimpleImputer()

    if self.problem_type == "classification":
        automl = TPOTClassifier(**self.automl_settings)
    else:
        automl = TPOTRegressor(**self.automl_settings)

    automl_pipeline = make_pipeline(preprocessing_steps, automl)
    return automl_pipeline
def onehot_or_targ(X, y, categorical, k):
    '''
    Returns X, y with categorical variables encoded, choosing the encoding
    per column based on a threshold value k.

    Parameters:
    -----------
    X: pd.DataFrame
        The dataframe of a given dataset, excluding its target column.
    y: pd.Series
        The series containing the target of a given dataset.
    categorical: list
        The names of the categorical columns.
    k: int
        Threshold on the number of unique values: columns with more than k
        unique values are target encoded, the rest are one-hot encoded.

    Returns:
    --------
    pd.DataFrame, pd.Series
        The updated dataframe with encoded categorical features, and the
        updated series with an encoded categorical target.
    '''
    for column in categorical:
        if len(X[column].unique()) > k:
            if X[column].dtype.name == 'category':
                X[column] = X[column].cat.codes
            if y.dtype.name == 'category':
                y = y.cat.codes
            X = TargetEncoder(cols=[column]).fit_transform(X, y)
        else:
            X = OneHotEncoder(cols=[column]).fit_transform(X)
    return X, y
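# Hypothetical usage sketch for onehot_or_targ above; the toy data and the
# threshold k=3 are invented for illustration.
import pandas as pd
from category_encoders import OneHotEncoder, TargetEncoder

X = pd.DataFrame({
    "city": ["NY", "LA", "SF", "CHI", "BOS", "NY"],  # 5 uniques > k -> target encoded
    "size": ["S", "M", "L", "S", "M", "L"],          # 3 uniques <= k -> one-hot encoded
})
y = pd.Series([1, 0, 1, 0, 1, 0])
X_enc, y_enc = onehot_or_targ(X, y, categorical=["city", "size"], k=3)
print(X_enc.columns.tolist())   # 'city' stays one numeric column, 'size' expands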
def one_hot_encode(self):
    one_hot_cols = [
        'events', 'zone', 'pitch_type', 'type', 'home_team', 'away_team',
        'pitch_count', 'L1_pitch_type', 'L1_pitch_result', 'L1_pitch_zone',
        '_count', 'count_cat', 'pitch_cat', 'balls', 'strikes', 'inning',
        'outs_when_up', 'batting_order_slot'
    ]
    # Instantiate encoder
    one_hot_encoder = OneHotEncoder(cols=one_hot_cols, return_df=True,
                                    use_cat_names=True)
    # Encode features
    encoded = one_hot_encoder.fit_transform(self.df[one_hot_cols],
                                            self.df['next_pitch'])
    # Join encoded features into df and drop the original columns
    self.df = self.df.join(encoded).drop(columns=one_hot_cols + ['events_0'])
def create_regression_pipeline(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    numerical_indexes = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    non_numerical_indexes = np.array([], int)
    one_hot_indexes_after_handle_missing_values = np.array([], int)
    ordinal_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"), numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle_categorical_features",
            ColumnTransformer(
                [
                    (
                        "feature_encoder_ordinal",
                        OrdinalEncoder(),
                        ordinal_indexes_after_handle_missing_values,
                    ),
                    (
                        "feature_encoder_onehot",
                        OneHotEncoder(),
                        one_hot_indexes_after_handle_missing_values,
                    ),
                ],
                remainder="passthrough",
            ),
        ),
        ("estimator", LinearRegression(fit_intercept=True)),
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    return {
        'features_train': X_train,
        'features_test': X_test,
        'target_train': y_train,
        'target_test': y_test,
        'target_predicted': y_pred,
        'regression_pipeline': pipeline,
    }
def _perform_categ_fit(self, df, y):
    # References:
    # https://github.com/scikit-learn-contrib/categorical-encoding
    # https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
    # https://en.wikipedia.org/wiki/Feature_hashing#Feature_vectorization_using_the_hashing_trick
    categ_cols = {}
    onehot_cols = []

    for col in self.categorical_vars:
        categs = df[col].astype(pd.api.types.CategoricalDtype()).cat.categories

        if self.categ_enc_method == "onehot":
            card = df[col].nunique()
            if card > 10:
                print("Warning, cardinality of {} = {}".format(col, card))
            onehot_cols.append(col)
        elif self.categ_enc_method == "target":
            if self.tfs_list["y"] is None:
                raise Exception(
                    "You have to pass your target variable to the fit() "
                    "function for target encoding")
            # Mean/target/likelihood encoding
            target_col_name = self.tfs_list["y"].name
            df_enc = df.copy()
            df_enc[target_col_name] = self.tfs_list["y"]
            # Running per-category statistics, excluding the current row
            cumsum = df_enc.groupby(col)[target_col_name].cumsum() - df_enc[target_col_name]
            cumcnt = df_enc.groupby(col)[target_col_name].cumcount()
            means = cumsum / cumcnt
            means.rename('mean_enc', inplace=True)
            mean_enc = pd.Series(means, index=self.tfs_list["y"]).to_dict()
            global_mean = self.tfs_list["y"].mean()
            categ_cols[col] = {"target": (global_mean, mean_enc)}
        elif self.categ_enc_method == "hashing":
            # Hashing trick: hash "col=value" strings into a fixed space
            str_hashs = [col + "=" + str(val) for val in categs]
            hashs = [hash(h) % self.hash_space for h in str_hashs]
            categ_cols[col] = {"hashing": hashs}

    if len(onehot_cols) > 0:
        enc = CategOneHot(cols=onehot_cols, handle_unknown='impute')
        enc.fit(df)
        self.tfs_list["onehot"] = enc

    self.tfs_list["categ_cols"] = categ_cols
def one_hot_encoder(self, df, configger):
    """
    :param df: the train dataset.
    :param configger: a JSON string of encoder settings; the params mean:
        verbose: int
            integer indicating verbosity of the output. 0 for none.
        cols: list
            a list of columns to encode; if None, all string columns will
            be encoded.
        drop_invariant: bool
            whether or not to drop columns with 0 variance.
        return_df: bool
            whether to return a pandas DataFrame from transform (otherwise
            it will be a numpy array).
        use_cat_names: bool
            if True, category values will be included in the encoded column
            names. Since this can result in duplicate column names,
            duplicates are suffixed with a '#' symbol until a unique name
            is generated. If False, category indices will be used instead
            of the category values.
        handle_unknown: str
            options are 'error', 'return_nan', 'value', and 'indicator';
            the default is 'value'. Warning: if 'indicator' is used, an
            extra column will be added when the transform matrix has
            unknown categories, which can cause unexpected changes in
            dimension in some cases.
        handle_missing: str
            options are 'error', 'return_nan', 'value', and 'indicator';
            the default is 'value'. Warning: if 'indicator' is used, an
            extra column will be added when the transform matrix has nan
            values, which can cause unexpected changes in dimension in
            some cases.

    :return: the transform result
    """
    X, y, encode_col = self.get_Xy(df, configger)

    drop_invariant = set_default_vale("drop_invariant", configger, False, is_bool=True)
    handle_missing = set_default_vale("handle_missing", configger, "value")
    handle_unknown = set_default_vale("handle_unknown", configger, "value")
    use_cat_names = set_default_vale("use_cat_names", configger, False, is_bool=True)

    encoder = OneHotEncoder(verbose=1,
                            cols=encode_col,
                            drop_invariant=drop_invariant,
                            return_df=True,
                            use_cat_names=use_cat_names,
                            handle_unknown=handle_unknown,
                            handle_missing=handle_missing)
    res = encoder.fit_transform(X, y)
    return res
def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get an encoder by its name.

    :param encoder_name: Name of the desired encoder
    :param cat_cols: Categorical columns to encode
    :return: Categorical encoder
    """
    encoder = None  # so the final check works when no name matches
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)
    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder
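# Hypothetical usage sketch for get_single_encoder above; the toy frame is
# invented for illustration. Note that most of the encoders named above come
# from category_encoders, but FrequencyEncoder does not and is assumed to be
# a custom class defined elsewhere in the source project.
import pandas as pd
from category_encoders import OneHotEncoder

df = pd.DataFrame({"color": ["red", "blue", "red"], "num": [1, 2, 3]})
encoder = get_single_encoder("OneHotEncoder", cat_cols=["color"])
print(encoder.fit_transform(df))   # 'color' expanded to indicator columns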
def get_single_encoder(encoder_name: str, cat_cols: list):
    encoder = None
    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == 'OneHotEncoder':
        encoder = OneHotEncoder(cols=cat_cols)
    # assert encoder is not None
    return encoder
def read_data(self, data_path, y_col, index_col=None, skip_cols=None, test_size=0.3):
    """
    Read a csv file with data. Categorical variables will be encoded
    according to the one-hot scheme.

    Parameters
    ----------
    data_path : str
        Path to the csv file with data.
    y_col : str
        Ground truth labels with values of 0 and 1.
    index_col : int, sequence or bool, optional
        Column to use as the row labels of the DataFrame. If a sequence is
        given, a MultiIndex is used.
    skip_cols : list
        List of features / columns to exclude from the data.
    test_size : float
        Should be between 0.0 and 1.0 and represent the proportion of the
        dataset to include in the test split.

    Returns
    -------
    data_splits : list, length = 4
        List containing the train-test split.
    """
    data = pd.read_csv(data_path, index_col=index_col, na_values=['NONE', 'na'])

    # drop columns where N/As constitute more than about 10% of all entries
    na_max_percent = 0.1
    nas = data.isna().sum()
    excessive_na_cols = set(nas[nas > na_max_percent * len(data)].index)
    excessive_na_cols = excessive_na_cols.union(set(skip_cols))
    data_cols = set(data.columns).difference(excessive_na_cols)
    if y_col not in data_cols:
        raise ValueError(f'Too many entries without the labels {y_col}')

    numeric_cols = set(data._get_numeric_data().columns).difference(excessive_na_cols)
    # since y_col contains 0, 1 it should be numeric
    categorical_cols = data_cols - numeric_cols
    numeric_cols.remove(y_col)

    data = data.loc[:, list(data_cols)]
    data = data.dropna()
    X = data.loc[:, list(numeric_cols.union(categorical_cols))]
    y = data[y_col].values

    # encode categorical variables
    encoder = OneHotEncoder(cols=categorical_cols, use_cat_names=True)
    X = encoder.fit_transform(X)

    data_splits = train_test_split(X, y, test_size=test_size,
                                   random_state=self._random_state)
    return data_splits
def eoa_fit(self, X, y, **kwargs):
    """
    Applies evolutionary optimization methods to find an optimum pipeline.

    :param X: Training data
    :param y: Corresponding observations
    :param kwargs: `EOA` parameters
    :return: `self`
    """
    from .structsearch import BoxSample, CompactSample
    from .eoa import EOA

    X_, y_ = X, y
    if self.cat_cols is not None:
        from category_encoders.one_hot import OneHotEncoder
        enc = OneHotEncoder(cols=self.cat_cols, return_df=False,
                            handle_unknown='ignore')
        enc.fit(X)
        X_ = enc.transform(X)
    self.num_features = len(X_[0])

    # Generate the population of all valid pipelines up to `self.length`
    Pop = []
    for l in range(1, self.length + 1):
        candidates = self.words.Generate(l)
        for cnddt in candidates:
            if self._validate_sequence(cnddt):
                Pop.append(cnddt)

    def _eval(ppl):
        if self.couldBfirst == []:
            from sklearn.pipeline import Pipeline
        else:
            from imblearn.pipeline import Pipeline
        from sklearn.model_selection import RandomizedSearchCV
        if self.surrogates is None:
            # Default surrogates: randomized searches over a Gaussian
            # process regressor and a kernel ridge regressor
            from numpy import logspace
            from sklearn.gaussian_process import GaussianProcessRegressor
            from sklearn.kernel_ridge import KernelRidge
            from sklearn.gaussian_process.kernels import Matern, Sum, ExpSineSquared, WhiteKernel
            param_grid_gpr = {
                "alpha": logspace(-8, 1, 20),
                "kernel": [Sum(Matern(length_scale=l_, nu=p), WhiteKernel(noise_level=q))
                           for l_ in logspace(-3, 3, 20)
                           for p in [0.5, 1.5, 2.5]
                           for q in logspace(-3, 1.5, 20)],
            }
            GPR = RandomizedSearchCV(GaussianProcessRegressor(),
                                     param_distributions=param_grid_gpr,
                                     n_iter=20, cv=2)
            param_grid_krr = {
                "alpha": logspace(-4, 0, 10),
                "kernel": [Sum(Matern(), ExpSineSquared(l_, p))
                           for l_ in logspace(-2, 2, 20)
                           for p in logspace(0, 2, 20)],
            }
            KRR = RandomizedSearchCV(KernelRidge(),
                                     param_distributions=param_grid_krr,
                                     n_iter=30, cv=2)
            self.surrogates = [(KRR, 35, CompactSample, 'L-BFGS-B'),
                               (GPR, 50, BoxSample, 'L-BFGS-B')]
            self.min_random_evals = 10
        from collections import OrderedDict
        fitted = OrderedDict([])
        for seq in ppl:
            best_mdl, best_scr = self.optimize_pipeline(seq, X_, y_)
            if seq not in self.models:
                self.models[seq] = (best_mdl, best_scr)
            if self.verbose > 0:
                print("score: %f" % best_scr)
                print(best_mdl)
            fitted[seq] = -best_scr
        return fitted

    num_parents = kwargs.pop('num_parents', 30)
    mutation_prob = kwargs.pop('mutation_prob', .1)
    _eoa = EOA(population=Pop, fitness=_eval, num_parents=num_parents,
               mutation_prob=mutation_prob, term_genes=self.couldBlast,
               init_genes=self.couldBfirst, **kwargs)
    _eoa()
    self.best_estimator_ = list(self.get_top(1).items())[0][1][0]
    return self
X_test = transform_types_X(X_test)
y_train, y_test = load_obj("y_train"), load_obj("y_test")
encoder = load_obj("label_encoder")

print("CHANGING COLUMN NAMES")
X_train.columns = [
    "".join(c if c.isalnum() else "_" for c in str(x)) for x in X_train.columns
]
X_test.columns = [
    "".join(c if c.isalnum() else "_" for c in str(x)) for x in X_test.columns
]

if args.encoder == "CatBoost":
    cat_encoder = CatBoostEncoder()
elif args.encoder == "OneHot":
    cat_encoder = OneHotEncoder()

print("APPLYING CATEGORICAL ENCODER")
X_train = cat_encoder.fit_transform(X_train, y_train)
X_test = cat_encoder.transform(X_test)

print("FITTING STACKING")
stacking.fit(X_train, y_train)
save_obj(stacking, f"{args.name}")

X_test, y_test = RandomUnderSampler(sampling_strategy={
    5: int(0.11 * 13526)
}).fit_resample(X_test, y_test)
preds = stacking.predict(X_test)
save_obj(preds, f"{args.name}_preds")

print(
    f"F1 SCORE {f1_score(y_test, preds, average='macro')}, "
    f"F2 SCORE {fbeta_score(y_test, preds, average='macro', beta=2)}, "
    f"F05 SCORE {fbeta_score(y_test, preds, average='macro', beta=0.5)}, "
    f"PRECISION IS {precision_score(y_test, preds, average='macro')}, "
    f"RECALL IS {recall_score(y_test, preds, average='macro')}, "
    f"ACCURACY IS {accuracy_score(y_test, preds)}"
)
cm = confusion_matrix(y_test, preds, normalize="true")
def fit(self, X: X_TYPES, y: Y_TYPES):
    """Fit to data.

    Parameters
    ----------
    X: dict, list, tuple, np.ndarray or pd.DataFrame
        Feature set with shape=(n_samples, n_features).

    y: int, str or sequence
        - If int: Index of the target column in X.
        - If str: Name of the target column in X.
        - Else: Target column with shape=(n_samples,).

    Returns
    -------
    self: Encoder
    """
    X, y = self._prepare_input(X, y)
    self._cat_cols = X.select_dtypes(exclude="number")

    # Check parameters
    if self.strategy.lower().endswith("encoder"):
        self.strategy = self.strategy[:-7]  # Remove the "Encoder" suffix
    if self.strategy.lower() not in ENCODING_STRATS:
        raise ValueError(
            f"Invalid value for the strategy parameter, got {self.strategy}. "
            f"Choose from: {', '.join(ENCODING_STRATS)}."
        )
    strategy = ENCODING_STRATS[self.strategy.lower()]

    if self.max_onehot is None:
        self.max_onehot = 0
    elif self.max_onehot < 0:  # if 0, 1 or 2: one-hot encoding is never used
        raise ValueError(
            "Invalid value for the max_onehot parameter. "
            f"Value should be >= 0, got {self.max_onehot}."
        )

    if self.frac_to_other:
        if self.frac_to_other <= 0 or self.frac_to_other >= 1:
            raise ValueError(
                "Invalid value for the frac_to_other parameter. Value "
                f"should be between 0 and 1, got {self.frac_to_other}."
            )

    self.log("Fitting Encoder...", 1)

    for col in X:
        self._to_other[col] = []
        if col in self._cat_cols:
            # Group uncommon classes into "other"
            if self.frac_to_other:
                for category, count in X[col].value_counts().items():
                    if count < self.frac_to_other * len(X[col]):
                        self._to_other[col].append(category)
                        X[col] = X[col].replace(category, "other")

            # Count the number of unique values in the column
            n_unique = len(X[col].unique())

            # Choose the encoding based on the number of unique values
            if n_unique == 2:
                self._encoders[col] = OrdinalEncoder(
                    dtype=np.int8,
                    handle_unknown="error",
                ).fit(X[col].values.reshape(-1, 1))
            elif 2 < n_unique <= self.max_onehot:
                self._encoders[col] = OneHotEncoder(
                    handle_missing="error",
                    handle_unknown="error",
                    use_cat_names=True,
                ).fit(pd.DataFrame(X[col]))
            else:
                self._encoders[col] = strategy(
                    handle_missing="error",
                    handle_unknown="error",
                    **self.kwargs,
                ).fit(pd.DataFrame(X[col]), y)

    self._is_fitted = True
    return self
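# A minimal standalone sketch of the encoder-selection rule used in fit()
# above (binary -> ordinal, few categories -> one-hot, many categories ->
# the configured strategy, e.g. target encoding). The helper name
# pick_encoder and the max_onehot=5 default are invented for illustration,
# and category_encoders' OrdinalEncoder stands in for the scikit-learn one.
import pandas as pd
from category_encoders import OneHotEncoder, OrdinalEncoder, TargetEncoder

def pick_encoder(series: pd.Series, max_onehot: int = 5):
    n_unique = series.nunique()
    if n_unique == 2:
        return OrdinalEncoder(cols=[series.name])
    elif n_unique <= max_onehot:
        return OneHotEncoder(cols=[series.name], use_cat_names=True)
    return TargetEncoder(cols=[series.name])

df = pd.DataFrame({"sex": ["M", "F", "M"], "city": ["NY", "LA", "SF"]})
print(type(pick_encoder(df["sex"])).__name__)    # OrdinalEncoder
print(type(pick_encoder(df["city"])).__name__)   # OneHotEncoder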
X_train, X_test = df[train_index], df[test_index]
y_train, y_test = df[train_index], df[test_index]

train = df.loc[:, df.columns != 'DEMAND']
test = df['DEMAND']

# Encode the categorical columns. Each assignment below replaces the
# previous encoder, so only the last OneHotEncoder is actually used.
encoder = SumEncoder(cols=['FABRICATION'])
encoder = SumEncoder(cols=[
    'CHANNEL', 'STYLE', 'COLOR', 'INVENTORY_GROUP', 'GENDER_CATEGORY_DESC',
    'FABRICATION', 'SILHOUETTE'
])
encoder = OneHotEncoder(cols=['FABRICATION'])
encoder = OneHotEncoder(cols=[
    'CHANNEL', 'STYLE', 'COLOR', 'INVENTORY_GROUP', 'GENDER_CATEGORY_DESC',
    'FABRICATION'
])
train = encoder.fit_transform(train)
train = train.drop(['intercept'], axis=1)  # 'intercept' is added by SumEncoder

# Scale the target
test = test.values
test = test.reshape(-1, 1)
scaler = StandardScaler()
scaler.fit(test)
test = scaler.transform(test)
def _generate_features(self, X, y=None, numeric_extra=None, categorical_extra=None):
    try:
        self.feature_pipeline_
    except AttributeError:
        n_days = X['dayofweek'].nunique()
        n_hours = X['hour'].nunique()

        self.feature_pipeline_ = Pipeline([
            ('features', FeatureUnion([
                # time-of-week part of TOWT
                ('weeks', Pipeline([
                    ('split', FeatureUnion([
                        ('days', Pipeline([
                            ('select', ColumnSelector('dayofweek')),
                            ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                        ])),
                        ('hours', Pipeline([
                            ('select', ColumnSelector('hour')),
                            ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                        ])),
                    ])),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=['dayofweek', 'hour']))),
                    ('term', PatsyTransformer('-1 + C(dayofweek):C(hour)')),
                ])) if (n_days > 1) and (n_hours > 1) else
                ('days', Pipeline([
                    ('select', ColumnSelector('dayofweek')),
                    ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                    ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=['dayofweek']))),
                    ('one_hot', OneHotEncoder(cols=['dayofweek'], return_df=False)),
                ])) if n_days > 1 else
                ('hours', Pipeline([
                    ('select', ColumnSelector('hour')),
                    ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                    ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=['hour']))),
                    ('one_hot', OneHotEncoder(cols=['hour'], return_df=False)),
                ])),

                # temperature part of TOWT
                ('temperature', ColumnTransformer([
                    ('encode_temperature', IntervalEncoder(
                        n_chunks=10,
                        span=0.1 * X[self.temperature_col].std(),
                        method='normal'), [self.temperature_col]),
                ])),
                ('temperature_interact', 'drop' if n_hours == 1 else Pipeline([
                    ('split', FeatureUnion([
                        ('temperature_part', Pipeline([
                            ('select', ColumnSelector(self.temperature_col)),
                            ('create_bins', KBinsDiscretizer(
                                n_bins=self.n_bins_temperature,
                                strategy='quantile',
                                encode='ordinal')),
                        ])),
                        ('hour_part', Pipeline([
                            ('select', ColumnSelector('hour')),
                            ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                        ])),
                    ])),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=[self.temperature_col, 'hour']))),
                    ('term', PatsyTransformer(f'-1 + C({self.temperature_col}):C(hour)')),
                ])),

                # deal with extra numerical regressors
                ('numerical_regressors', 'drop' if not numeric_extra else ColumnTransformer([
                    (f'encode_{col}', IntervalEncoder(
                        n_chunks=4,
                        span=0.1 * X[col].std(),
                        method='normal'), [col])
                    for col in numeric_extra
                ])),

                # deal with extra categorical regressors
                ('categorical_regressors', 'drop' if not categorical_extra else
                    TargetEncoder(cols=categorical_extra,
                                  return_df=False,
                                  handle_missing='value',
                                  handle_unknown='value')),
            ])),
        ])

        # Fit the pipeline
        self.feature_pipeline_.fit(X, y)
    finally:
        return self.feature_pipeline_.transform(X)
if df[col].dtype == object:
    imp = SimpleImputer(strategy='most_frequent')
    df[col] = imp.fit_transform(df[[col]])
else:
    imp = SimpleImputer(strategy='mean')
    df[col] = imp.fit_transform(df[[col]])

## Analysing the Data
# my_report = sweetviz.analyze([df, 'Train'], target_feat='G3')
# my_report.show_html()

## Scaling and Encoding the data
for colum in df.columns:
    if df[colum].dtype == object:
        # print(colum, df[colum].unique().tolist())
        # One-hot encoding yields one indicator column per category, so the
        # result must be joined back rather than assigned to a single column
        encoded = OneHotEncoder(cols=[colum], use_cat_names=True).fit_transform(df[[colum]])
        df = df.drop(columns=colum).join(encoded)

columns = df.columns
df = MinMaxScaler().fit_transform(df)
df = pd.DataFrame(df, columns=columns)

## Finding the Correlations between Features
# sns.heatmap(df.corr(), fmt='.1f', annot=True)
# plt.show()
correlations = df.corr()['SalePrice'].drop('SalePrice')
# print(correlations)
# print(correlations.quantile(.25))
# print(correlations.quantile(.75))
# print(correlations.quantile(.50))
def load():
    train = pd.read_csv("/kaggle/input/google-quest-challenge/train.csv")
    test = pd.read_csv("/kaggle/input/google-quest-challenge/test.csv")
    target_cols = [
        'question_asker_intent_understanding', 'question_body_critical',
        'question_conversational', 'question_expect_short_answer',
        'question_fact_seeking', 'question_has_commonly_accepted_answer',
        'question_interestingness_others', 'question_interestingness_self',
        'question_multi_intent', 'question_not_really_a_question',
        'question_opinion_seeking', 'question_type_choice',
        'question_type_compare', 'question_type_consequence',
        'question_type_definition', 'question_type_entity',
        'question_type_instructions', 'question_type_procedure',
        'question_type_reason_explanation', 'question_type_spelling',
        'question_well_written', 'answer_helpful',
        'answer_level_of_information', 'answer_plausible',
        'answer_relevance', 'answer_satisfaction',
        'answer_type_instructions', 'answer_type_procedure',
        'answer_type_reason_explanation', 'answer_well_written'
    ]
    data_cols = ['question_title', 'question_body', 'answer', 'category']

    y_train = train[target_cols].copy()
    print(type(y_train))
    x_train = train[data_cols].copy()
    del train
    x_test = test.copy()
    del test

    # Use a separate doc2vec model per text field
    question_body_doc2vec = MyDoc2Vec()
    answer_doc2vec = MyDoc2Vec()
    x_train_question_vec = question_body_doc2vec.fit_transform(x_train['question_body'])
    x_test_question_vec = question_body_doc2vec.transform(x_test['question_body'])
    x_train_answer_vec = answer_doc2vec.fit_transform(x_train['answer'])
    x_test_answer_vec = answer_doc2vec.transform(x_test['answer'])
    print(x_train_question_vec.shape)

    text_encoder = Pipeline(
        [('Text-TF-IDF', TfidfVectorizer(ngram_range=(1, 1))),
         ('Text-SVD', TruncatedSVD(n_components=100))],
        verbose=True)
    ohe = OneHotEncoder(cols=['category'])
    preprocessor = ColumnTransformer([
        ('Q-T', text_encoder, 'question_title'),
        ('Q-B', text_encoder, 'question_body'),
        ('A', text_encoder, 'answer'),
        ('Category', ohe, 'category'),
    ])

    x_train = preprocessor.fit_transform(x_train).astype(np.float32)
    x_test = preprocessor.transform(x_test).astype(np.float32)
    y_train = y_train.values.astype(np.float32)

    x_train = np.concatenate([x_train, x_train_question_vec, x_train_answer_vec], axis=1)
    x_test = np.concatenate([x_test, x_test_question_vec, x_test_answer_vec], axis=1)
    return x_train, y_train, x_test
def create_classification_pipeline(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

    numerical_indexes = np.array([0, 1, 2, 3])
    non_numerical_indexes = np.array([], int)
    ordinal_indexes_after_handle_missing_values = np.array([], int)
    one_hot_indexes_after_handle_missing_values = np.array([], int)

    pipeline = Pipeline(steps=[
        (
            "handle_missing_values",
            ColumnTransformer(
                [
                    ("imputer_mean", SimpleImputer(strategy="mean"), numerical_indexes),
                    (
                        "imputer_mode",
                        SimpleImputer(strategy="most_frequent"),
                        non_numerical_indexes,
                    ),
                ],
                remainder="drop",
            ),
        ),
        (
            "handle_categorical_features",
            ColumnTransformer(
                [
                    (
                        "feature_encoder_ordinal",
                        OrdinalEncoder(),
                        ordinal_indexes_after_handle_missing_values,
                    ),
                    (
                        "feature_encoder_onehot",
                        OneHotEncoder(),
                        one_hot_indexes_after_handle_missing_values,
                    ),
                ],
                remainder="passthrough",
            ),
        ),
        (
            "estimator",
            LogisticRegression(
                solver="liblinear",
                penalty="l2",
                C=1.0,
                fit_intercept=True,
                class_weight=None,
                max_iter=100,
                multi_class="auto",
            ),
        ),
    ])

    _ = pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)

    return {
        'features_train': X_train,
        'features_test': X_test,
        'target_train': y_train,
        'target_test': y_test,
        'target_predicted': y_pred,
        'target_probability': y_prob,
        'classification_pipeline': pipeline,
    }
def enc(X):
    e = CEOneHotEncoder(use_cat_names=True, handle_unknown='ignore').fit(X)
    return e.transform(X)
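# Hypothetical usage sketch for enc above; the toy frame is invented, and
# CEOneHotEncoder is assumed to be category_encoders' OneHotEncoder imported
# under an alias. Note that handle_unknown='ignore' comes from older
# category_encoders releases; recent versions use 'value' for similar behavior.
import pandas as pd
from category_encoders import OneHotEncoder as CEOneHotEncoder

X = pd.DataFrame({"fruit": ["apple", "pear", "apple"]})
print(enc(X))   # with use_cat_names=True, columns come out as fruit_apple, fruit_pear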
text_encoder = Pipeline(
    [('Text-TF-IDF', TfidfVectorizer(ngram_range=(1, 3))),
     ('Text-SVD', TruncatedSVD(n_components=100))],
    verbose=True)

# Encode 'url': keep the part of the netloc before the first '.'
before_dot = re.compile('^[^.]*')

def transform_url(x):
    return x.apply(lambda v: re.findall(before_dot, urlparse(v).netloc)[0])

url_encoder = Pipeline(
    [('URL-transformer', FunctionTransformer(transform_url, validate=False)),
     ('URL-OHE', OneHotEncoder(drop_invariant=True))],
    verbose=True)

# Encode 'category'
ohe = OneHotEncoder(cols='category', drop_invariant=True)

# Transform
preprocessor = ColumnTransformer([('Q-T', text_encoder, 'question_title'),
                                  ('Q-B', text_encoder, 'question_body'),
                                  ('A', text_encoder, 'answer'),
                                  ('URL', url_encoder, 'url'),
                                  ('Category', ohe, 'category')],
                                 verbose=True)

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)
print(x_train.shape)