import warnings

import matplotlib.pyplot as plot
import numpy as np
from prince import MCA
from sklearn.metrics import silhouette_score


def mca_benchmark(name, x, clf):
    """Plot silhouette averages for clusterings of MCA projections of x."""
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Cluster the data after projecting it onto 1 to 4 MCA components and
    # record the silhouette average for each projection.
    averages = []
    n_components = range(1, 5)
    for n_component in n_components:
        mca = MCA(n_components=n_component)
        transformed = mca.fit_transform(x).values
        predicted_labels = clf.fit_predict(transformed)
        silhouette_avg = silhouette_score(transformed, predicted_labels)
        averages.append(silhouette_avg)

    # Pad the y-axis limits by 20% of the score amplitude for readability.
    lb = np.min(averages)
    ub = np.max(averages)
    amplitude = ub - lb
    lb -= 0.2 * amplitude
    ub += 0.2 * amplitude

    plot.style.use('seaborn-darkgrid')
    plot.title(
        f'Silhouette averages on the {name} dataset '
        f'using {repr(clf).split("(")[0]} and MCA'
    )
    plot.bar(n_components, averages)
    plot.xticks(n_components)
    plot.xlabel('Number of components')
    plot.ylabel('Silhouette averages')
    plot.ylim([lb, ub])
    plot.show()
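# A minimal usage sketch for mca_benchmark. The toy DataFrame and the KMeans
# settings below are illustrative assumptions, not part of the original code;
# any scikit-learn clusterer exposing fit_predict would work here.
if __name__ == '__main__':
    import pandas as pd
    from sklearn.cluster import KMeans

    toy = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'] * 25,
                        'size': ['S', 'M', 'L', 'M'] * 25})
    mca_benchmark('toy', toy, KMeans(n_clusters=3, random_state=0))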
import warnings

import matplotlib.pyplot as plot
import numpy as np
from prince import MCA
from sklearn.decomposition import PCA


def pca_eigenvalues(x_adult, x_wine):
    """Compare the first ten eigenvalues of PCA and MCA decompositions."""
    warnings.simplefilter(action='ignore', category=FutureWarning)

    pca = PCA(n_components=10)
    pca.fit(x_adult)
    y_adult = pca.explained_variance_

    pca = PCA(n_components=10)
    pca.fit(x_wine)
    y_wine = pca.explained_variance_

    mca = MCA(n_components=10)
    mca.fit(x_wine)
    # MCA eigenvalues are much smaller than the PCA ones, so scale them by
    # 100 to make the three curves comparable on one plot.
    y_wine2 = 100 * np.array(mca.eigenvalues_)

    x_axis = [k + 1 for k in range(10)]
    plot.style.use('seaborn-darkgrid')
    plot.title('Eigenvalue distributions')
    plot.xlabel('Eigenvalue index')
    plot.ylabel('Eigenvalue')
    plot.xticks(x_axis, x_axis)
    plot.plot(x_axis, np.transpose([y_adult, y_wine, y_wine2]), 'o-')
    plot.legend(['Adult', 'Wine reviews (PCA)', 'Wine reviews (MCA) x100'],
                loc='upper right')
    plot.show()
import warnings

import matplotlib.pyplot as plot
from prince import MCA


def mca(name, x, y):
    """Scatter-plot the first two MCA dimensions of x, colored by y."""
    warnings.simplefilter(action='ignore', category=FutureWarning)

    ma = MCA(n_components=2)
    transformed = ma.fit_transform(x).values

    plot.style.use('seaborn-darkgrid')
    plot.title(f'MCA on {name}')
    plot.xlabel('First dimension')
    plot.ylabel('Second dimension')
    plot.scatter(transformed[:, 0], transformed[:, 1], c=y, cmap='viridis')
    plot.show()
import pandas as pd
from prince import MCA


def mca_reduction(df, columns, n_component, drop=False):
    """Append n_component MCA coordinates of the given columns to df."""
    values = df[columns]
    mca = MCA(n_components=n_component)
    # NOTE: pd.concat aligns on the index, so df is expected to carry a
    # default RangeIndex matching the freshly built coordinate frame.
    new_df = pd.concat([
        df,
        pd.DataFrame(mca.fit_transform(values).values,
                     columns=[f'mca_{x}' for x in range(1, n_component + 1)])
    ], axis=1)
    if drop:
        new_df.drop(columns=columns, inplace=True)
    return new_df, mca.explained_inertia_
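# A minimal usage sketch for mca_reduction (illustrative data, not from the
# original code). Assumes an older prince release (< 0.8) that still exposes
# explained_inertia_.
if __name__ == '__main__':
    toy = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                        'size': ['S', 'M', 'L', 'M']})
    reduced, inertia = mca_reduction(toy, ['color', 'size'],
                                     n_component=2, drop=True)
    print(reduced.columns.tolist())  # ['mca_1', 'mca_2']
    print(inertia)                   # explained inertia per component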
from prince import MCA
from sklearn.pipeline import FeatureUnion, make_pipeline


def mca():
    """Build a pipeline: MCA on the nominal features, numeric features
    passed through, then everything scaled together."""
    # specific(), nominal(), numeric() and scale() are project-local
    # factories (see the hypothetical sketch below).
    pipe_mca = make_pipeline(specific(), nominal(), MCA(n_components=25))
    pipe_numeric = make_pipeline(specific(), numeric())
    return make_pipeline(
        FeatureUnion([('mca_on_nominal', pipe_mca),
                      ('rest_of_data', pipe_numeric)]),
        scale())
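# The factories used above are not shown in this snippet. Purely to make the
# pipeline concrete, here is a hypothetical sketch of what they might return;
# these implementations are assumptions, not the original code.
from sklearn.preprocessing import FunctionTransformer, StandardScaler


def specific():
    # Hypothetical pass-through standing in for dataset-specific cleaning.
    return FunctionTransformer(lambda df: df)


def nominal():
    # Hypothetical selector for the nominal (object-typed) columns.
    return FunctionTransformer(lambda df: df.select_dtypes(include='object'))


def numeric():
    # Hypothetical selector for the numeric columns.
    return FunctionTransformer(lambda df: df.select_dtypes(include='number'))


def scale():
    return StandardScaler()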
import numpy as np
import pandas as pd
from prince import MCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError


class DFMCA(BaseEstimator, TransformerMixin):
    # NOTE:
    # - Use DFMCA(n_components=df[columns].apply(lambda x: len(x.unique())).sum())
    #   to retain every dimension.
    # - Make sure binary-encoded features are converted to strings, so that
    #   prince.MCA() generates new one-hot encoded features via pd.get_dummies().
    def __init__(self, columns=None, prefix='mca_', **kwargs):
        self.columns = columns
        self.prefix = prefix
        self.model = MCA(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://www.appliedaicourse.com/lecture/11/applied-machine-learning-online-course/2896/pca-for-dimensionality-reduction-not-visualization/0/free-videos
        self.stat_df = pd.DataFrame({
            'dimension': [x + 1 for x in range(len(self.model.eigenvalues_))],
            'eigenvalues': self.model.eigenvalues_,
            'explained_inertia': self.model.explained_inertia_,
            'cumsum_explained_inertia': np.cumsum(self.model.explained_inertia_)
        })

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator.")

        new_X = self.model.transform(X[self.transform_cols])
        new_X.rename(columns=dict(zip(new_X.columns,
                                      [f'{self.prefix}{x}' for x in new_X.columns])),
                     inplace=True)
        new_X = pd.concat([X.drop(columns=self.transform_cols), new_X], axis=1)

        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
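# A minimal usage sketch for DFMCA (illustrative data; assumes prince < 0.8,
# which exposes eigenvalues_ and explained_inertia_).
if __name__ == '__main__':
    toy = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                        'size': ['S', 'M', 'L', 'M'],
                        'price': [1.0, 2.5, 3.0, 2.0]})
    # Reduce only the nominal columns; 'price' is passed through untouched.
    dfmca = DFMCA(columns=['color', 'size'], n_components=2)
    reduced = dfmca.fit_transform(toy)
    print(reduced.columns.tolist())  # ['price', 'mca_0', 'mca_1']
    print(dfmca.stat_df)             # eigenvalues and explained inertia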
def __init__(self, state, n_districts, n_seeds=200, use_MCA=False,
             use_binary_features=True, binary_n=2, n_components=10, **kwargs):
    self.use_MCA = use_MCA
    self.use_binary_features = use_binary_features
    self.binary_n = binary_n  # How many pairs to use, multiplier of the number of tiles.

    if use_MCA:
        try:
            from prince import MCA
        except ImportError:
            print('prince not imported. Run "pip install prince"')
            exit()
        self.dim_reduction = MCA(n_components=n_components)
    else:
        try:
            from sklearn.decomposition import PCA
        except ImportError:
            print('sklearn not imported. Run "pip install scikit-learn"')
            exit()
        self.dim_reduction = PCA(n_components=n_components)

    seeds = [
        districts.make_random(state, n_districts) for _ in range(n_seeds)
    ]

    if use_binary_features:
        # Draw the random tile-index pairs once, so the binary features are
        # computed consistently across all seeds.
        n = state.n_tiles * self.binary_n
        self.binary_idxs_a = np.random.randint(0, high=state.n_tiles - 1, size=n)
        self.binary_idxs_b = np.random.randint(0, high=state.n_tiles - 1, size=n)
        seeds = [self._makeBinaryFeature(f) for f in seeds]

    self.dim_reduction.fit(np.array(seeds))
    self.archive = np.array(self.dim_reduction.transform(seeds)).tolist()

    super().__init__(state, n_districts, **kwargs)
import numpy as np
import pandas as pd
from prince import MCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler


def preprocess(df, train_length):
    """Split df into train/test parts, PCA-reduce the numeric features and
    MCA-reduce the champion pick/ban features."""
    df.drop([
        'redFirstBlood', 'red_firstInhibitor', 'red_firstBaron',
        'red_firstRiftHerald', 'gameId'
    ], axis=1, inplace=True)

    champ_cols = [
        'blue_champ_1', 'blue_champ_2', 'blue_champ_3', 'blue_champ_4',
        'blue_champ_5', 'red_champ_1', 'red_champ_2', 'red_champ_3',
        'red_champ_4', 'red_champ_5', 'ban_1', 'ban_2', 'ban_3', 'ban_4',
        'ban_5', 'ban_6', 'ban_7', 'ban_8', 'ban_9', 'ban_10'
    ]

    train_target = df['blueWins'].iloc[:train_length].reset_index(drop=True)
    test_target = df['blueWins'].iloc[train_length:].reset_index(drop=True)
    df.drop(['blueWins', 'redWins'], axis=1, inplace=True)

    # Champion.get_champions is a project-local helper that decodes the raw
    # champion values.
    for col in champ_cols:
        df[col] = Champion.get_champions(list(df[col].values))

    # Collect the one-hot column names of champions that appear fewer than
    # 80 times. NOTE: removal_list is built but never used below.
    removal_list = set()
    for champ_col in champ_cols:
        for key, value in df[champ_col].value_counts(ascending=False).to_dict().items():
            if value < 80:
                for i in range(10):
                    removal_list.add('ban_{}_{}'.format(i, key))
                for i in range(5):
                    removal_list.add('blue_champ_{}_{}'.format(i, key))
                    removal_list.add('red_champ_{}_{}'.format(i, key))

    numerical_cols = [
        i for i in df if df[i].dtype in
        ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    ]
    # evaluate_dist(df, numerical_cols)

    # Log-transform the heavily right-skewed count features.
    cols_to_be_transformed = [
        'blueWardsDestroyed', 'redWardsDestroyed', 'blueWardsPlaced',
        'redWardsPlaced', 'redTowersDestroyed', 'blueTowersDestroyed'
    ]
    for col in cols_to_be_transformed:
        df[col] = np.log1p(df[col])
    # evaluate_dist(df, cols_to_be_transformed)

    train_df = df.iloc[:train_length].reset_index(drop=True)
    test_df = df.iloc[train_length:, :].reset_index(drop=True)

    # Scale the numeric features and keep enough principal components to
    # explain 95% of the variance.
    train_df_for_scale = train_df[
        train_df.columns[~train_df.columns.isin(champ_cols)]]
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train_df_for_scale)
    pca = PCA(.95)
    pcs = pca.fit_transform(scaled_data)
    train_pca_df = pd.DataFrame(
        pcs, columns=['PC_{}'.format(i) for i in range(np.size(pcs, 1))])

    champ_df = train_df[train_df.columns[train_df.columns.isin(champ_cols)]]
    champ_select_df = champ_df[champ_cols[:10]]
    champ_ban_df = champ_df[champ_cols[10:]]
    mca_ban = MCA(n_components=5)
    mca_select = MCA(n_components=3)
    ban_mca = mca_ban.fit_transform(champ_ban_df)
    select_mca = mca_select.fit_transform(champ_select_df)
    ban_mca.columns = [
        'MCA_Ban_{}'.format(i) for i in range(np.size(ban_mca, 1))
    ]
    select_mca.columns = [
        'MCA_Select_{}'.format(i) for i in range(np.size(select_mca, 1))
    ]
    train_reduced_df = pd.concat([ban_mca, select_mca, train_pca_df], axis=1)

    # Apply the transformations fitted on the training split to the test
    # split; re-fitting on test data would leak it and change the projection.
    test_df_for_scale = test_df[
        test_df.columns[~test_df.columns.isin(champ_cols)]]
    scaled_data = scaler.transform(test_df_for_scale)
    pcs = pca.transform(scaled_data)
    test_pca_df = pd.DataFrame(
        pcs, columns=['PC_{}'.format(i) for i in range(np.size(pcs, 1))])

    champ_df = test_df[test_df.columns[test_df.columns.isin(champ_cols)]]
    champ_select_df = champ_df[champ_cols[:10]]
    champ_ban_df = champ_df[champ_cols[10:]]
    ban_mca = mca_ban.transform(champ_ban_df)
    select_mca = mca_select.transform(champ_select_df)
    ban_mca.columns = [
        'MCA_Ban_{}'.format(i) for i in range(np.size(ban_mca, 1))
    ]
    select_mca.columns = [
        'MCA_Select_{}'.format(i) for i in range(np.size(select_mca, 1))
    ]
    test_reduced_df = pd.concat([ban_mca, select_mca, test_pca_df], axis=1)

    return train_reduced_df, test_reduced_df, train_target, test_target
def mca(df, k):
    """Run multiple correspondence analysis on df with k components."""
    # NOTE: this passes the data frame to the constructor, which does not
    # match prince.MCA's API; it assumes an MCA implementation that is
    # fitted at construction time.
    return MCA(df, n_components=k)
def reduction_dims(self, cols=None, method="pca", final_number_dims=2, visualize=True):
    if not self.is_standardize:
        raise ValueError("You should standardize your columns first.")

    if not cols:
        # Will use all the columns of the dataset on the dim reduction analysis
        cols = self.dataset.columns.tolist()

    if method == "pca":
        pca = PCA(n_components=final_number_dims)
        principal_components = pca.fit_transform(self.dataset[cols])
        for index in range(0, final_number_dims):
            self.dataset[f"PC{index + 1}"] = principal_components[:, index]
        logger.info(
            "Principal components analysis finished. Explained variance ratio:"
        )
        components_variance = [
            "{:.12f}".format(i)[:8] for i in pca.explained_variance_ratio_
        ]
        logger.info(components_variance)

        if visualize and final_number_dims == 2:
            x = self.dataset["PC1"]
            y = self.dataset["PC2"]
            scalex = 1.0 / (x.max() - x.min())
            scaley = 1.0 / (y.max() - y.min())
            coeff = np.transpose(pca.components_)

            fig = plt.figure(figsize=(8, 8))
            sp = fig.add_subplot(1, 1, 1)
            sp.set_xlabel(
                f"PC 1 - Variance ratio: {components_variance[0]}", fontsize=15)
            sp.set_ylabel(
                f"PC 2 - Variance ratio: {components_variance[1]}", fontsize=15)
            sp.set_xlim(-1, 1)
            sp.set_ylim(-1, 1)
            sp.set_title("PCA Biplot", fontsize=20)
            sp.scatter(x * scalex, y * scaley, s=50)
            for i, col in enumerate(cols):
                plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color="r",
                          head_width=0.02, length_includes_head=True)
                plt.text(coeff[i, 0] / 2, coeff[i, 1] / 2, col,
                         color="g", ha="left", va="baseline")
            sp.grid()
            plt.show()
    elif method == "mca":
        mca = MCA(n_components=final_number_dims)
        dataset_mca = mca.fit_transform(self.dataset[cols])
        for index in range(0, final_number_dims):
            self.dataset[f"MC{index + 1}"] = dataset_mca[index]
        logger.info(
            "Multiple correspondence analysis finished. Explained variance ratio:"
        )
        mca_variance = [
            "{:.12f}".format(i)[:8] for i in mca.explained_inertia_
        ]
        logger.info(mca_variance)

        if visualize and final_number_dims == 2:
            mca.plot_coordinates(X=self.dataset[cols])
    else:
        raise ValueError(
            "Method of dimensionality reduction not implemented.")
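# A standalone sketch of the "mca" branch above (illustrative data; assumes
# prince < 0.8, which exposes explained_inertia_ and plot_coordinates).
# prince's fit_transform returns a DataFrame with integer column labels,
# which is why dataset_mca[index] above selects the index-th component.
if __name__ == '__main__':
    import pandas as pd
    from prince import MCA

    toy = pd.DataFrame({'fruit': ['apple', 'pear', 'apple', 'plum'] * 25,
                        'grade': ['A', 'B', 'B', 'A'] * 25})
    mca = MCA(n_components=2)
    coords = mca.fit_transform(toy)
    toy['MC1'], toy['MC2'] = coords[0], coords[1]  # same naming scheme as above
    print(mca.explained_inertia_)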
# Add k-means-derived features to both splits.
df_km = KMeans_Feature().fit(train[numeric_columns]).transform(
    train[numeric_columns])
train = pd.concat([train, df_km], axis=1, sort=True)
df_km = KMeans_Feature().fit(test[numeric_columns]).transform(
    test[numeric_columns])
test = pd.concat([test, df_km], axis=1, sort=True)

y = train.Survived
x = train.drop(columns=['Survived'])
xtrain, xval, ytrain, yval = TTS(x, y, test_size=0.3, random_state=42,
                                 stratify=y)

# MCA on the categoric columns, then scaling, kernel PCA and XGBoost.
categoric_transformer = Pipeline(steps=[('MCA', MCA(n_components=2))])
preprocessor = ColumnTransformer(
    transformers=[('cat', categoric_transformer, categoric_columns)])
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('Scaler', StandardScaler()),
                       ('PCA', KernelPCA(n_components=4, kernel='rbf')),
                       ('XGB', xgb.XGBClassifier())])

# Randomized-search space for the XGBoost step.
RSCparameter = {
    'XGB__n_estimators': st.randint(300, 2000),
    'XGB__learning_rate': st.uniform(0.01, 0.1),
    'XGB__gamma': st.uniform(0.01, 0.5),
    'XGB__reg_alpha': st.uniform(0.01, 0.5),