Example #1
def mca_benchmark(name, x, clf):
    warnings.simplefilter(action='ignore', category=FutureWarning)
    averages = []
    n_components = range(1, 5)

    for n_component in n_components:
        mca = MCA(n_components=n_component)
        transformed = mca.fit_transform(x).values
        predicted_labels = clf.fit_predict(transformed)
        silhouette_avg = silhouette_score(transformed, predicted_labels)
        averages.append(silhouette_avg)

    lb = np.min(averages)
    ub = np.max(averages)
    amplitude = ub - lb
    lb -= 0.2 * amplitude
    ub += 0.2 * amplitude

    plot.style.use('seaborn-darkgrid')
    plot.title(
        f'Silhouette averages on the {name} dataset using {repr(clf).split("(")[0]} and MCA'
    )
    plot.bar(n_components, averages)
    plot.xticks(n_components)
    plot.xlabel('Number of components')
    plot.ylabel('Silhouette averages')
    plot.ylim([lb, ub])
    plot.show()
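A hedged usage sketch: the imports below are inferred from the names used in the snippet, and the DataFrame and clusterer are placeholders, not the original author's data.

import warnings
import numpy as np
import matplotlib.pyplot as plot
from prince import MCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# x is assumed to be a DataFrame of categorical features.
# Note: on matplotlib >= 3.6 the style is named 'seaborn-v0_8-darkgrid'.
mca_benchmark('Wine reviews', x, KMeans(n_clusters=3))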
Example #2
def pca_eigenvalues(x_adult, x_wine):
    warnings.simplefilter(action='ignore', category=FutureWarning)
    pca = PCA(n_components=10)
    pca.fit(x_adult)
    y_adult = pca.explained_variance_

    pca = PCA(n_components=10)
    pca.fit(x_wine)
    y_wine = pca.explained_variance_

    mca = MCA(n_components=10)
    mca.fit(x_wine)
    y_wine2 = 100 * np.array(mca.eigenvalues_)

    x_axis = [k + 1 for k in range(10)]

    plot.style.use('seaborn-darkgrid')
    plot.title('Eigenvalue distributions')
    plot.xlabel('Eigenvalue index')
    plot.ylabel('Eigenvalue')
    plot.xticks(x_axis, x_axis)
    plot.plot(x_axis, np.transpose([y_adult, y_wine, y_wine2]), 'o-')
    plot.legend(['Adult', 'Wine reviews (PCA)', 'Wine reviews (MCA) x100'],
                loc='upper right')
    plot.show()
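The MCA eigenvalues are scaled by 100 (hence the "x100" in the legend), presumably so they stay visible on the same axis as the much larger PCA eigenvalues.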
Example #3
def mca(name, x, y):
    warnings.simplefilter(action='ignore', category=FutureWarning)
    ma = MCA(n_components=2)
    transformed = ma.fit_transform(x).values

    plot.style.use('seaborn-darkgrid')
    plot.title(f'MCA on {name}')
    plot.xlabel('First dimension')
    plot.ylabel('Second dimension')
    plot.scatter(transformed[:, 0], transformed[:, 1], c=y, cmap='viridis')
    plot.show()
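A possible call, assuming x holds the categorical features and y integer class labels used for coloring (both hypothetical):

mca('Adult', x, y)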
Example #4
def mca_reduction(df, columns, n_component, drop=False):
    values = df[columns]
    mca = MCA(n_components=n_component)

    components = pd.DataFrame(
        mca.fit_transform(values).values,
        columns=[f'mca_{x}' for x in range(1, n_component + 1)])
    new_df = pd.concat([df, components], axis=1)

    if drop:
        new_df.drop(columns=columns, inplace=True)

    return new_df, mca.explained_inertia_
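A minimal usage sketch; the DataFrame and its column names are hypothetical:

reduced_df, inertia = mca_reduction(df, ['country', 'variety'], n_component=2, drop=True)
print(inertia)  # share of total inertia explained by each MCA component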
Example #5
def mca():
    pipe_mca = make_pipeline(specific(), nominal(), MCA(n_components=25))
    pipe_numeric = make_pipeline(specific(), numeric())

    return make_pipeline(
        FeatureUnion([('mca_on_nominal', pipe_mca),
                      ('rest_of_data', pipe_numeric)]), scale())
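specific(), nominal(), numeric() and scale() are not defined in this snippet; they are assumed to be project-specific factories that return scikit-learn transformers (e.g. column selectors and a scaler).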
Example #6
class DFMCA(BaseEstimator, TransformerMixin):
    # NOTE:
    # - Use DFMCA(n_components=df[columns].apply(lambda x: len(x.unique())).sum()) to retain every dimension
    # - Cast binary-encoded features to strings so that prince.MCA() generates new one-hot encoded features via pd.get_dummies()
    def __init__(self, columns=None, prefix='mca_', **kwargs):
        self.columns        = columns
        self.prefix         = prefix
        self.model          = MCA(**kwargs)
        self.transform_cols = None
        self.stat_df        = None
        
    def fit(self, X, y=None):
        self.columns        = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://www.appliedaicourse.com/lecture/11/applied-machine-learning-online-course/2896/pca-for-dimensionality-reduction-not-visualization/0/free-videos
        self.stat_df = pd.DataFrame({
            'dimension': [x+1 for x in range(len(self.model.eigenvalues_))],
            'eigenvalues': self.model.eigenvalues_,
            'explained_inertia': self.model.explained_inertia_,
            'cumsum_explained_inertia': np.cumsum(self.model.explained_inertia_)
        })

        return self
    
    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = self.model.transform(X[self.transform_cols])
        new_X.rename(columns=dict(zip(new_X.columns, [f'{self.prefix}{x}' for x in new_X.columns])), inplace=True)
        new_X = pd.concat([X.drop(columns=self.transform_cols), new_X], axis=1)

        return new_X
    
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
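A hedged usage sketch; the DataFrame and its categorical columns are hypothetical:

dfmca = DFMCA(columns=['cat_a', 'cat_b'], n_components=3)
reduced = dfmca.fit_transform(df)
print(dfmca.stat_df)  # per-dimension eigenvalues and (cumulative) explained inertia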
Example #7
    def __init__(self,
                 state,
                 n_districts,
                 n_seeds=200,
                 use_MCA=False,
                 use_binary_features=True,
                 binary_n=2,
                 n_components=10,
                 **kwargs):
        self.use_MCA = use_MCA
        self.use_binary_features = use_binary_features
        self.binary_n = binary_n  # How many pairs to use, multiplier of the number of tiles.
        if use_MCA:
            try:
                from prince import MCA
            except ImportError:
                print('prince not installed. Run "pip install prince"')
                exit()
            self.dim_reduction = MCA(n_components=n_components)
        else:
            try:
                from sklearn.decomposition import PCA
            except ImportError:
                print('scikit-learn not installed. Run "pip install scikit-learn"')
                exit()
            self.dim_reduction = PCA(n_components=n_components)

        seeds = [
            districts.make_random(state, n_districts) for _ in range(n_seeds)
        ]

        if use_binary_features:
            n = state.n_tiles * self.binary_n
            self.binary_idxs_a = np.random.randint(0,
                                                   high=state.n_tiles - 1,
                                                   size=n)
            self.binary_idxs_b = np.random.randint(0,
                                                   high=state.n_tiles - 1,
                                                   size=n)
            seeds = [self._makeBinaryFeature(f) for f in seeds]

        self.dim_reduction.fit(np.array(seeds))
        self.archive = np.array(self.dim_reduction.transform(seeds)).tolist()
        super().__init__(state, n_districts, **kwargs)
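This __init__ is an excerpt from a larger class (note the super().__init__ call); districts.make_random and self._makeBinaryFeature are defined elsewhere in that project.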
Example #8
def preprocess(df, train_length):
    df.drop([
        'redFirstBlood', 'red_firstInhibitor', 'red_firstBaron',
        'red_firstRiftHerald', 'gameId'
    ],
            axis=1,
            inplace=True)

    champ_cols = [
        'blue_champ_1', 'blue_champ_2', 'blue_champ_3', 'blue_champ_4',
        'blue_champ_5', 'red_champ_1', 'red_champ_2', 'red_champ_3',
        'red_champ_4', 'red_champ_5', 'ban_1', 'ban_2', 'ban_3', 'ban_4',
        'ban_5', 'ban_6', 'ban_7', 'ban_8', 'ban_9', 'ban_10'
    ]

    train_target = df['blueWins'].iloc[:train_length].reset_index(drop=True)
    test_target = df['blueWins'].iloc[train_length:].reset_index(drop=True)

    df.drop(['blueWins', 'redWins'], axis=1, inplace=True)

    for col in champ_cols:
        df[col] = Champion.get_champions(list(df[col].values))

    removal_list = set()

    for champ_col in champ_cols:

        for key, value in df[champ_col].value_counts(
                ascending=False).to_dict().items():
            if value < 80:
                for i in range(10):
                    removal_list.add('ban_{}_{}'.format(i, key))
                for i in range(5):
                    removal_list.add('blue_champ_{}_{}'.format(i, key))
                    removal_list.add('red_champ_{}_{}'.format(i, key))

    numerical_cols = [
        i for i in df if df[i].dtype in
        ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    ]

    #evaluate_dist(df, numerical_cols)

    cols_to_be_transformed = [
        'blueWardsDestroyed', 'redWardsDestroyed', 'blueWardsPlaced',
        'redWardsPlaced', 'redTowersDestroyed', 'blueTowersDestroyed'
    ]

    for col in cols_to_be_transformed:
        df[col] = np.log1p(df[col])

    #evaluate_dist(df, cols_to_be_transformed)

    train_df = df.iloc[:train_length].reset_index(drop=True)
    test_df = df.iloc[train_length:, :].reset_index(drop=True)

    train_df_for_scale = train_df[
        train_df.columns[~train_df.columns.isin(champ_cols)]]

    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train_df_for_scale)
    pca = PCA(.95)
    pcs = pca.fit_transform(scaled_data)

    train_pca_df = pd.DataFrame(
        pcs, columns=['PC_{}'.format(i) for i in range(np.size(pcs, 1))])

    champ_df = train_df[train_df.columns[train_df.columns.isin(champ_cols)]]
    champ_select_df = champ_df[champ_cols[:10]]
    champ_ban_df = champ_df[champ_cols[10:]]

    mca_ban = MCA(n_components=5)
    mca_select = MCA(n_components=3)

    ban_mca = mca_ban.fit_transform(champ_ban_df)
    select_mca = mca_select.fit_transform(champ_select_df)

    ban_mca.columns = [
        'MCA_Ban_{}'.format(i) for i in range(np.size(ban_mca, 1))
    ]
    select_mca.columns = [
        'MCA_Select_{}'.format(i) for i in range(np.size(select_mca, 1))
    ]

    train_reduced_df = pd.concat([ban_mca, select_mca, train_pca_df], axis=1)

    test_df_for_scale = test_df[
        test_df.columns[~test_df.columns.isin(champ_cols)]]

    scaled_data = scaler.transform(test_df_for_scale)

    pcs = pca.transform(scaled_data)

    test_pca_df = pd.DataFrame(
        pcs, columns=['PC_{}'.format(i) for i in range(np.size(pcs, 1))])

    champ_df = test_df[test_df.columns[test_df.columns.isin(champ_cols)]]
    champ_select_df = champ_df[champ_cols[:10]]
    champ_ban_df = champ_df[champ_cols[10:]]

    # Reuse the MCA models fitted on the training set; refitting on the test set would leak
    ban_mca = mca_ban.transform(champ_ban_df)
    select_mca = mca_select.transform(champ_select_df)

    ban_mca.columns = [
        'MCA_Ban_{}'.format(i) for i in range(np.size(ban_mca, 1))
    ]
    select_mca.columns = [
        'MCA_Select_{}'.format(i) for i in range(np.size(select_mca, 1))
    ]

    test_reduced_df = pd.concat([ban_mca, select_mca, test_pca_df], axis=1)

    return train_reduced_df, test_reduced_df, train_target, test_target
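A possible call, assuming df stacks the training rows on top of the test rows (the length is hypothetical):

train_X, test_X, train_y, test_y = preprocess(df, train_length=8000)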
Example #9
def mca(df, k):
    """The executed CA."""
    return MCA(df, n_components=k)
Example #10
    def reduction_dims(self,
                       cols=None,
                       method="pca",
                       final_number_dims=2,
                       visualize=True):
        if not self.is_standardize:
            raise ValueError("You should standardize your columns first.")
        if not cols:
            # Will use all the columns of the dataset on the dim reduction analysis
            cols = self.dataset.columns.tolist()

        if method == "pca":
            pca = PCA(n_components=final_number_dims)
            principal_components = pca.fit_transform(self.dataset[cols])

            for index in range(0, final_number_dims):
                self.dataset[f"PC{index + 1}"] = principal_components[:, index]

            logger.info(
                "Principal components analysis finished. Explained variance ratio:"
            )
            components_variance = [
                "{:.12f}".format(i)[:8] for i in pca.explained_variance_ratio_
            ]
            logger.info(components_variance)

            if visualize and final_number_dims == 2:
                x = self.dataset["PC1"]
                y = self.dataset["PC2"]
                scalex = 1.0 / (x.max() - x.min())
                scaley = 1.0 / (y.max() - y.min())
                coeff = np.transpose(pca.components_)

                fig = plt.figure(figsize=(8, 8))
                sp = fig.add_subplot(1, 1, 1)
                sp.set_xlabel(
                    f"PC 1 - Variance ratio: {components_variance[0]}",
                    fontsize=15)
                sp.set_ylabel(
                    f"PC 2 - Variance ratio: {components_variance[1]}",
                    fontsize=15)
                sp.set_xlim(-1, 1)
                sp.set_ylim(-1, 1)
                sp.set_title("PCA Biplot", fontsize=20)
                sp.scatter(x * scalex, y * scaley, s=50)
                for i, col in enumerate(cols):
                    plt.arrow(0,
                              0,
                              coeff[i, 0],
                              coeff[i, 1],
                              color="r",
                              head_width=0.02,
                              length_includes_head=True)
                    plt.text(coeff[i, 0] / 2,
                             coeff[i, 1] / 2,
                             col,
                             color="g",
                             ha="left",
                             va="baseline")
                sp.grid()
                plt.show()

        elif method == "mca":
            mca = MCA(n_components=final_number_dims)
            dataset_mca = mca.fit_transform(self.dataset[cols])

            for index in range(0, final_number_dims):
                self.dataset[f"MC{index + 1}"] = dataset_mca[index]

            logger.info(
                "Multiple correspondence analysis finished. Explained variance ratio:"
            )
            mca_variance = [
                "{:.12f}".format(i)[:8] for i in mca.explained_inertia_
            ]
            logger.info(mca_variance)

            if visualize and final_number_dims == 2:
                mca.plot_coordinates(X=self.dataset[cols])
        else:
            raise ValueError(
                "Method of dimensionality reduction not implemented.")
Example #11
df_km = KMeans_Feature().fit(train[numeric_columns]).transform(
    train[numeric_columns])
train = pd.concat([train, df_km], axis=1, sort=True)
df_km = KMeans_Feature().fit(test[numeric_columns]).transform(
    test[numeric_columns])
test = pd.concat([test, df_km], axis=1, sort=True)

y = train.Survived
x = train.drop(columns=['Survived'])
xtrain, xval, ytrain, yval = TTS(x,
                                 y,
                                 test_size=0.3,
                                 random_state=42,
                                 stratify=y)

categoric_transformer = Pipeline(steps=[('MCA', MCA(n_components=2))])

preprocessor = ColumnTransformer(transformers=[('cat', categoric_transformer,
                                                categoric_columns)])

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Scaler', StandardScaler()),
    ('PCA', KernelPCA(n_components=4, kernel='rbf')),
    ('XGB', xgb.XGBClassifier()),
])

RSCparameter = {
    'XGB__n_estimators': st.randint(300, 2000),
    'XGB__learning_rate': st.uniform(0.01, 0.1),
    'XGB__gamma': st.uniform(0.01, 0.5),
    'XGB__reg_alpha': st.uniform(0.01, 0.5),
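The parameter dictionary above is truncated in the source. A typical continuation would hand it to a randomized search; this is a sketch, not the original code, and assumes st is scipy.stats:

from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(pipe, RSCparameter, n_iter=50, cv=5, random_state=42)
search.fit(xtrain, ytrain)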