Example #1
import prince
import matplotlib.pyplot as plt

def perform_correspondence_analysis(data_frame):
    mca = prince.MCA(n_components=2,
                     n_iter=3,
                     copy=True,
                     check_input=True,
                     engine='auto',
                     random_state=42)
    ptumor_mca = mca.fit(data_frame)

    ax = ptumor_mca.plot_coordinates(X=data_frame,
                                     ax=None,
                                     figsize=(10, 10),
                                     show_row_points=False,
                                     row_points_size=0,
                                     show_row_labels=False,
                                     show_column_points=True,
                                     column_points_size=30,
                                     show_column_labels=True,
                                     legend_n_cols=1).legend(
                                         loc='center left',
                                         bbox_to_anchor=(1, 0.5))

    plt.show()
Example #2
 def do_mca(self):
     import prince
     x_train, x_test, y_train, y_test = self.scale_data()
     mca = prince.MCA(n_components=12, random_state=42)
     x_train = mca.fit_transform(x_train)
     x_test = mca.transform(x_test)  # transform only; the model is already fitted on x_train
     return x_train, x_test, y_train, y_test
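
A note on the fix above: in scikit-learn-style APIs such as prince, fit() returns the fitted estimator itself rather than transformed data, so the original x_test = mca.fit(x_test) both refitted on the test set and bound the wrong object. A minimal sketch of the intended pattern (the toy frames below are invented for illustration; they keep every category in both splits, since prince one-hot encodes with pd.get_dummies at transform time):

import pandas as pd
import prince

train = pd.DataFrame({'color': ['red', 'blue', 'red', 'blue'],
                      'size': ['S', 'M', 'L', 'S']})
test = pd.DataFrame({'color': ['blue', 'red', 'red'],
                     'size': ['M', 'S', 'L']})

mca = prince.MCA(n_components=2, random_state=42)
train_coords = mca.fit_transform(train)  # fit on the training split only
test_coords = mca.transform(test)        # project the test split with the fitted model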
Example #3
def performMCA(state):
    global persondatadf, mca_res
    cat_data = persondatadf[['SCHL', 'RAC1P', 'ESR', 'ST']]

    if state > 0:
        cat_data = cat_data[cat_data['ST'] == state]

    cat_data = cat_data.sample(frac=0.1)
    cat_data.drop(columns="ST", inplace=True)

    cat_data['RAC1P'] = cat_data['RAC1P'].apply(lambda race: races.get(race))
    cat_data['SCHL'] = cat_data['SCHL'].apply(lambda deg: degrees.get(deg))
    cat_data['ESR'] = cat_data['ESR'].apply(lambda emp: employment.get(emp))
    cat_data = cat_data.fillna("Unknown")
    mca = prince.MCA(
         n_components=2,
         n_iter=3,
         copy=True,
         check_input=True,
         engine='auto',
         random_state=42
    )
    mca = mca.fit(cat_data)
    mca_res = mca.column_coordinates(cat_data)
    mca_res.reset_index(inplace=True)
    types = {
        "SCHL": 1,
        "RAC1P": 2,
        "ESR": 3
    }
    mca_res['type'] = mca_res['index'].apply(lambda idx: types.get(idx.split('_')[0]))
    mca_res['index'] = mca_res['index'].apply(lambda idx: idx.split('_')[1])
    mca_res.rename(columns={0: 'x', 1: 'y', 'index': 'cat'}, inplace=True)
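
The frame built above ends up with columns x, y, cat, and type, which is shaped for plotting. A hypothetical follow-up (not part of the original example) that scatters the category coordinates coloured by their originating variable:

import matplotlib.pyplot as plt

plt.scatter(mca_res['x'], mca_res['y'], c=mca_res['type'], cmap='tab10')
plt.xlabel('Component 0')
plt.ylabel('Component 1')
plt.show()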
Example #4
def generate_mca(df, save_path=None):
    """
    :param df: input dataframe; may mix categorical and numerical variables (only the categorical ones are kept for the MCA)
    :param save_path: path for saving the figure
    :return:
    """
    cat_feature_list = [
        x for x in df.columns if x not in df._get_numeric_data().columns
    ]
    df_cc_cat = df[cat_feature_list]
    mca = prince.MCA(n_components=2,
                     n_iter=3,
                     copy=True,
                     check_input=True,
                     engine='auto',
                     random_state=42)

    mca.fit(df_cc_cat.astype('category'))

    mca.plot_coordinates(X=df_cc_cat.astype('category'),
                         ax=None,
                         figsize=(7, 7),
                         show_row_points=False,
                         row_points_size=30,
                         show_row_labels=False,
                         show_column_points=True,
                         column_points_size=40,
                         show_column_labels=True,
                         legend_n_cols=2)
    if save_path is not None:
        plt.savefig(save_path / Path('mca.png'))
        plt.close('all')
Example #5
 def mca(self, df, cols):
     """
     cols: list of categorical column names
     
     Do a multiple correspondence analysis
     """
     mca = prince.MCA(n_components=2,
                      n_iter=3,
                      copy=True,
                      check_input=True,
                      engine='auto')
     mca.fit(df[cols])
     fig, ax = plt.subplots(figsize=(10, 10))
     mca.plot_coordinates(X=df[cols],
                          ax=ax,
                          show_row_points=True,
                          row_points_size=10,
                          show_row_labels=False,
                          show_column_points=True,
                          column_points_size=30,
                          show_column_labels=False,
                          legend_n_cols=1)
     plt.show()
     print("Explained inertia:", mca.explained_inertia_)
     return mca
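
Example #5 prints explained_inertia_, the per-component share of total inertia in pre-0.8 prince releases (newer releases expose this through other accessors). A common follow-up, sketched here under that same old API, is to use the cumulative share to pick how many components to keep:

import numpy as np

# mca is a fitted prince.MCA (pre-0.8 API, as used throughout these examples)
cum_inertia = np.cumsum(mca.explained_inertia_)
n_keep = int(np.searchsorted(cum_inertia, 0.80)) + 1  # components needed to reach 80%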
Example #6
 def test_row_cos2(self):
     mca = prince.MCA(n_components=4, random_state=42)
     mca.fit(self.X)
     r_cos2 = mca.row_cos2()
     self.assertEqual(r_cos2.shape, (6, 4))
     pd.testing.assert_index_equal(r_cos2.index, mca.row_masses_.index)
     self.assertTrue(np.all((r_cos2 >= 0) & (r_cos2 <= 1)), "All cos2 values should be between 0 and 1")
     self.assertTrue(np.all(r_cos2.sum(axis=1) <= 1), "Cos2 summed across dimensions should be at most 1")
Example #7
 def test_column_cos2(self):
     mca = prince.MCA(n_components=4, random_state=42)
     mca.fit(self.X)
     c_cos2 = mca.column_cos2()
     self.assertEqual(c_cos2.shape, (22, 4))
     pd.testing.assert_index_equal(c_cos2.index, mca.col_masses_.index)
     self.assertTrue(np.all((c_cos2 >= 0) & (c_cos2 <= 1)), "All cos2 values should be between 0 and 1")
     # Should really be <= 1., but allow for floating point error
     self.assertTrue(np.all(c_cos2.sum(axis=1) <= 1.000001), "Cos2 summed across dimensions should be at most 1")
Example #8
 def categorical_MCA(self, X, n_components=4, n_iter=1000):
     X = X.copy()
     cat_data, non_cat_data = self.categorize_data(X)
     mca = prince.MCA(n_components=n_components, n_iter=n_iter)
     cat_reduced = mca.fit_transform(cat_data)
     cat_reduced.columns = [
         "MCA_{}".format(i + 1) for i, v in enumerate(cat_reduced.columns)
     ]
     return pd.concat([cat_reduced, non_cat_data], axis=1)
Example #9
def prince_mca(X, facet1n2_sample_ids, facet1_sample_ids, facet2_sample_ids,
               pairID, unique_samples, unique_facets):
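    # NOTE: facet1, facet2, and outF are module-level globals defined elsewhere in the source file.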
    facet1_trunc = facet1[0:15]
    facet2_trunc = facet2[0:15]

    outF.write('Started prince_mca module...' + '\n')

    df = pd.DataFrame(data=X, index=unique_samples, columns=unique_facets)

    mca = prince.MCA(df, n_components=2)
    v = mca.n_rows

    row_principal_coordinates = mca.row_principal_coordinates
    row_principal_coordinates.columns = ['x', 'y']

    # adding info to df for graph drawing

    row_principal_coordinates.index.name = 'id'
    row_principal_coordinates['Attribute'] = 0
    row_principal_coordinates.loc[
        row_principal_coordinates.index.isin(facet1_sample_ids),
        'Attribute'] = facet1
    row_principal_coordinates.loc[
        row_principal_coordinates.index.isin(facet2_sample_ids),
        'Attribute'] = facet2
    row_principal_coordinates.loc[
        row_principal_coordinates.index.isin(facet1n2_sample_ids),
        'Attribute'] = 'both'

    # adds column describing spot frequency for depth visualisation
    # row_principal_coordinates['s'] = row_principal_coordinates.groupby(['x','y']).transform('count')
    print(row_principal_coordinates.groupby(['x', 'y']).transform('count'))
    sys.exit()

    # # To see data input
    # print(row_principal_coordinates)
    mcaname = str('data/plots/mca_' + str(pairID) + '.dat')
    row_principal_coordinates.to_csv(mcaname)

    # try:
    # # producing 3D dimension reduction for mancluster.py (generally turned off in mancluster too)
    # mca3 = prince.MCA(df, n_components=3)
    # mca3_df = mca3.row_principal_coordinates
    # mca3_df.columns = ['x', 'y', 'z']
    # mca3_df.index.name = 'id'
    # mca3_df.loc[mca3_df.index.isin(facet1_sample_ids), 'Attribute'] = facet1
    # mca3_df.loc[mca3_df.index.isin(facet2_sample_ids), 'Attribute'] = facet2
    # mca3_df.loc[mca3_df.index.isin(facet1n2_sample_ids), 'Attribute'] = 'both'
    # mca3_df['s'] = mca3_df.groupby(['x','y', 'z']).transform('count')
    # mca3name = str('data/plots/mca3d_' + str(pairID) + '.dat')
    # mca3_df.to_csv(mca3name)
    # except ValueError: # an unknown issue which rarely occurs to reproduce run phenotypes and phenotype
    # continue

    return (row_principal_coordinates, mcaname)
Example #10
def MCA(X):
    #X = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data')
    #X.columns = ['Color', 'Size', 'Action', 'Age', 'Inflated']

    print(X.head())
    mca = prince.MCA()

    mca = mca.fit(X)  # same as calling ca.fs_r(1)
    mca = mca.transform(
        X)  # same as calling ca.fs_r_sup(df_new) for *another* test set.
    return mca
Example #11
    def dimension_reduction_pca_mca(self, df):
        ### MCA - Categorical features
        print("--------------------------")
        print(df.head())
        cat_cols = []
        if self.one_hot is True:
            cat_cols = self.encoders.get_feature_names(
                self.categorical_features_initial) + ["exist_closed"]
        else:
            cat_cols = self.categorical_features_final
        print(cat_cols)
        # If my_mca is None, it means it is df_train. So we fit my_mca
        n_comp = len(cat_cols)
        if self.my_mca is None:
            self.my_mca = prince.MCA(n_components=n_comp)
            self.my_mca.fit(df[cat_cols])

        cols = []
        for i in range(n_comp):
            cols += ["MCA_" + str(i + 1)]

        print(df[cat_cols].shape)
        print(self.my_mca)
        aux_mca = self.my_mca.transform(df[cat_cols])
        aux_mca.columns = cols
        df = df.drop(cat_cols, axis=1)
        df = df.join(aux_mca)

        ## PCA - Numerical features

        # If my_pca is 0, it means it is df_train. So we fit my_pca
        num_cols = self.numerical_features_final
        n_comp = len(num_cols)
        print(num_cols)
        if self.my_pca is None:
            self.my_pca = PCA(n_components=n_comp)
            self.my_pca.fit(df[num_cols])

        cols = []
        for i in range(n_comp):
            cols += ["PCA_" + str(i + 1)]

        print(df.head())
        print(num_cols)
        aux_pca = pd.DataFrame(self.my_pca.transform(df[num_cols]),
                               columns=cols)
        df = df.drop(num_cols, axis=1)
        df = df.join(aux_pca)
        print(self.my_pca.explained_variance_ratio_)
        print(df.head())

        return df
Example #12
    def MCAX(data, cat_columns):
        """
        cat_columns: list of categorical columns
        """
        X = data[cat_columns]
        print(X.head())
        mca = prince.MCA()

        mca = mca.fit(X)  # same as calling ca.fs_r(1)
        mca = mca.transform(
            X)  # same as calling ca.fs_r_sup(df_new) for *another* test set.
        print(mca)
        return mca

        def OneHotEncode(self, data, column):
            encoder = ce.OneHotEncoder(cols=[column], return_df=True)
            return encoder.fit_transform(data)

        # High Cardinality of Features
        def FeatureHasher(self, data, column, components):
            encoder = ce.HashingEncoder(cols=column, n_components=components)
            return encoder.fit_transform(data)

        def LabelEncoding(self, data, column):
            encoder = ce.OrdinalEncoder(cols=[column], return_df=True)
            return encoder.fit_transform(data)

        # High Cardinality of Features
        def BinaryEncoder(self, data, column):
            encoder = ce.BinaryEncoder(cols=[column], return_df=True)
            return encoder.fit_transform(data)

        def feature_encode(self, data):
            l = []
            for col in data.columns:
                n = data.groupby([col])
                if n.ngroups > 10000:
                    continue
                if n.ngroups < 10000 and n.ngroups > 1000:
                    l.append(self.BinaryEncoder(data, col))
                    continue
                if n.ngroups < 1000 and n.ngroups > 10:
                    l.append(self.FeatureHasher(data, col, 15))
                    continue
Example #13
def Type_Conversion(Stats_Master):

    PCA_Master = Stats_Master.copy()

    PCA_Type = PCA_Master[['Type_1', 'Type_2']].fillna(value='None')

    mca = prince.MCA(n_components=15,
                     n_iter=100,
                     copy=False,
                     engine='auto',
                     random_state=42)
    Typed_MCA = mca.fit(PCA_Type)
    print(np.sum(Typed_MCA.explained_inertia_))

    mca_Components = Typed_MCA.U_
    df = pd.DataFrame(mca_Components, index=mca_Components[:, 0]).reset_index()
    df = df.drop(columns=['index'])

    return df
Example #14
def do_preprocess_3(df, target_col_name, plots_path, suffix):
    # ------------------------------------------------------------------------------------ #
    # Do data pre-processing with the second approach to dimensionality reduction:
    # find the similarity between the various levels with a clustering technique and
    # use the cluster numbers instead of the actual levels.
    # ------------------------------------------------------------------------------------ #
    # Drop a few columns as they should not have any influence on the target
    # ------------------------------------------------------------------------------------ #

    df = drop_cols(df, ["userid", "doctorid", "transdate"])
    logging.info(df.describe())
    logging.info(df.dtypes)
    # Since all the columns are categorical attributes convert to the appropriate
    # categorical type
    for col in df.columns:
        df[col] = df[col].astype('category')

    logging.info('Dimensionality Reduction with MCA')
    mca = prince.MCA(df, n_components=1100)
    # logging.info('principal components are :: '+str(mca.categorical_columns))
    # logging.info('principal components are :: ' + str(mca.column_component_contributions))
    # logging.info('principal components are :: ' + str(mca.column_correlations))
    # logging.info('principal components are :: ' + str(mca.column_cosine_similarities))
    print('MCA is :: ', mca)
    logging.info('principal components are :: ' + str(mca.eigenvalues))
    logging.info('column_correlations are :: ' + str(mca.column_correlations))
    logging.info('cumulative_explained_inertia are :: ' +
                 str(mca.cumulative_explained_inertia))
    logging.info('explained_inertia are :: ' + str(mca.explained_inertia))
    logging.info('cumulative_explained_inertia are :: ' +
                 str(mca.row_cosine_similarities))
    logging.info(' row_principal_coordinates are :: ' +
                 str(mca.row_principal_coordinates))
    # logging.info('principal components are :: ' + str(mca.column_standard_coordinates))
    # mca.plot_rows(show_points=True, show_labels=False, ellipse_fill=True)
    # mca.plot_relationship_square()
    mca.plot_cumulative_inertia(threshold=0.8)
    # plt.savefig(str(plots_path) + 'MCA_Analysis_Cumulative_Inertia_'+suffix + '.png')
    print(df.head())
    logging.info("Pre-processed data frame :: ")
    logging.info(df.describe())
    logging.info(df.dtypes)
Example #15
def do_MCA(X, n_components=10):
    '''
    Performs multiple correspondence analysis on X.
    '''
    warnings.filterwarnings("ignore", category=FutureWarning)
    # run the MCA using prince
    mca = prince.MCA(n_components=n_components)
    mca = mca.fit(X)

    # individual loadings onto components
    mca.ind_scores = mca.row_coordinates(X).values

    # edge loadings onto components
    edge_scores = mca.column_coordinates(X).values

    # exclude every other row (the zero loadings)
    mca.edge_scores = edge_scores[1::2, :]

    return mca
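
Example #15 leans on the distinction between row_coordinates (one point per observation) and column_coordinates (one point per category level); its [1::2] slice then drops every other level row, which is specific to its binary edge encoding. A minimal sketch of pulling both tables out of a fitted model, on toy data invented for illustration:

import pandas as pd
import prince

X = pd.DataFrame({'color': ['red', 'blue', 'red', 'blue'],
                  'size': ['S', 'M', 'L', 'S']})
mca = prince.MCA(n_components=2, random_state=42).fit(X)

rows = mca.row_coordinates(X)     # shape: (n_observations, n_components)
cols = mca.column_coordinates(X)  # shape: (n_category_levels, n_components)
print(rows.shape, cols.shape)     # (4, 2) and (5, 2) here: 2 colours + 3 sizes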
Example #16
def SampleAndMCA():

    FIELDS = {
        'Council District': True,
        'Report Day': True,
        'Clearance Day': True,
        'Highest NIBRS/UCR Offense Description': True,
        'GO Location Zip': True,
        '_id': False,
        "Clearance Status": True
    }

    connection = MongoClient(MONGODB_HOST, MONGODB_PORT)
    collection = connection[DBS_NAME][COLLECTION_NAME]
    projects = collection.find(projection=FIELDS)

    sample = []
    cnt = 0
    limit = 10000
    for project in projects:
        if (cnt < limit):
            sample.append(project)
        else:
            idx = random.randint(0, cnt)  # randint is inclusive; uniform over the cnt + 1 items seen
            if (idx < limit):
                sample[idx] = project
        cnt = cnt + 1

    df = pd.DataFrame(sample)

    mca = prince.MCA(df, n_components=2)

    fig1, ax1 = mca.plot_cumulative_inertia()
    fig3, ax3 = mca.plot_rows_columns()
    fig4, ax4 = mca.plot_relationship_square()

    plt.show()

    return json.dumps("Sampling and Multiple Correspondence Analysis done")
Example #17
 def test_plot_show_column_labels(self):
     mca = prince.MCA(n_components=2)
     mca.fit(self.X)
     ax = mca.plot_coordinates(self.X, show_column_labels=True)
     self.assertTrue(isinstance(ax, mpl.axes.Axes))
Example #18
 def test_pandas_dataframe(self):
     mca = prince.MCA(n_components=2)
     self.assertTrue(isinstance(mca.fit(self.X), prince.MCA))
     self.assertTrue(isinstance(mca.transform(self.X), pd.DataFrame))
Example #19
 def test_numpy_array(self):
     mca = prince.MCA(n_components=2)
     self.assertTrue(isinstance(mca.fit(self.X.to_numpy()), prince.MCA))
     self.assertTrue(
         isinstance(mca.transform(self.X.to_numpy()), pd.DataFrame))
Example #20
import pandas as pd
import matplotlib.pyplot as plt
import prince

df = pd.read_csv('data/datalab_persona_run1_with_scale_cat_classless.csv')

df2 = pd.read_csv('data/datalab_persona_run1_with_scale_cat.csv')
df_class = df2['class'].values

cols = ['g' if x=='smoker' else 'b' for x in df_class]

# df = pd.read_csv('data/ogm.csv')

# cols = [x for x in df2.columns.values if
#         x not in ['Age Next at DOC', 'Height', 'Weight', 'Annual Salary', 'Travel %']]

# df = pd.get_dummies(df)

mca = prince.MCA(df, n_components=-1)

# Set the axes you want to examine below, i.e. which component pair you are interested in - (0, 1)

vals = mca.row_principal_coordinates

print(len(vals))

vals = vals.values


plt.scatter(vals[:,0], vals[:,1], c=cols)

mca = prince.MCA(df2, n_components=-1)

# Set the axes you want to examine below, i.e. which component pair you are interested in - (0, 1)
Example #21
 def test_eigenvalues_are_corrected(self):
     mca = prince.MCA(n_components=4, random_state=42)
     mca.fit(self.X)
     self.assertEqual(mca.K, 10)
     np.testing.assert_allclose(mca.eigenvalues_, [.7004, .0123, .0003, 0], atol=0.0001)
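
The test name suggests this prince build reports Benzécri-corrected eigenvalues. For reference, the classical Benzécri correction maps each raw indicator-matrix eigenvalue lam to ((K / (K - 1)) * (lam - 1 / K)) ** 2 when lam > 1 / K and to 0 otherwise, where K is the number of variables (mca.K == 10 above). A sketch of the textbook formula, not necessarily the library's exact implementation:

import numpy as np

def benzecri_correction(eigenvalues, K):
    # Classical Benzécri correction for MCA eigenvalues of the indicator matrix.
    lam = np.asarray(eigenvalues, dtype=float)
    corrected = ((K / (K - 1)) * (lam - 1 / K)) ** 2
    return np.where(lam > 1 / K, corrected, 0.0)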
Example #22
        tend_doy = pd.to_datetime(row['tend']).dayofyear

        #calculate the median DOY
        doy_mean = int((tstart_doy + tend_doy) / 2)
        doy_mean_year.append(doy_mean)

    doy_mean_year = np.asarray(doy_mean_year)
    doy_mean_year = doy_mean_year.reshape(-1, 1)

    # Encoding mean DOY with detection
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(doy_mean_year)
    doy_mean_enc = enc.transform(doy_mean_year).toarray()

    # MCA of detected DOYs
    mca = prince.MCA()
    mca = mca.fit(doy_mean_enc)
    mca = mca.transform(doy_mean_enc)
    mca_doy = np.array(mca)

    ####################################################################
    ## Stacking of variables of each features:
    # - number of detections
    # - avg intensity
    # - started month of detection
    stack = np.column_stack((det_int, mca_doy))

    # Principal Component Analysis (PCA)
    # Reducing the dimensions of stack from four to two
    pca = PCA(n_components=2)
    principal_comp = pca.fit_transform(stack)
Example #23
 def test_total_inertia(self):
     mca = prince.MCA(n_components=4, random_state=42)
     mca.fit(self.X)
     np.testing.assert_almost_equal(mca.total_inertia_, 0.7130, 4)    
Example #24
    def visualizePoints_comparison(self,
                                   Sn_inst,
                                   datapoints,
                                   fig2,
                                   position,
                                   reductionMethod="pca",
                                   training=False):
        from sklearn import decomposition
        count_inst = self.explain_indices.index(Sn_inst)
        n_inst = int(Sn_inst)
        instTmp2 = Orange.data.Instance(self.explain_dataset.domain,
                                        self.explain_dataset[count_inst])
        c = self.classifier(instTmp2, False)

        labelledInstance = deepcopy(instTmp2)
        X = datapoints.X
        y = datapoints.Y

        if reductionMethod == "pca":
            pca = decomposition.PCA(n_components=3)
            pca.fit(X)
            X = pca.transform(X)
            istance_transformed = pca.transform([labelledInstance.x])

        elif reductionMethod == "mca":
            import pandas as pd
            import prince

            dataK = []
            for k in range(0, len(datapoints)):
                dataK.append(datapoints[k].list)

            columnsA = [i.name for i in datapoints.domain.variables]

            if datapoints.domain.metas != ():
                for i in range(0, len(datapoints.domain.metas)):
                    columnsA.append(datapoints.domain.metas[i].name)
            data = pd.DataFrame(data=dataK, columns=columnsA)

            columnsA = [i.name for i in datapoints.domain.attributes]
            Xa = data[columnsA]
            y = datapoints.Y

            mca = prince.MCA(n_components=3,
                             n_iter=3,
                             copy=True,
                             check_input=True,
                             engine='auto',
                             random_state=42)
            mca.fit(Xa)
            X = mca.transform(Xa)
            istance_transformed = mca.transform([[
                labelledInstance[i].value
                for i in labelledInstance.domain.attributes
            ]])

        elif reductionMethod == "t-sne":
            from sklearn.manifold import TSNE
            XX = np.vstack([X, labelledInstance.x])
            label_istance = float(max(list(self.map_names_class.keys())) + 1)
            yy = np.concatenate((y, np.array([label_istance])))
            tsne = TSNE(n_components=2, random_state=0)
            tsne.fit(XX)
            XX = tsne.fit_transform(XX)

        else:
            print("Reduction method available: pca, t-sne, selected",
                  reductionMethod)
        label_istance = float(max(list(self.map_names_class.keys())) + 1)
        y_l = y.astype(int)
        labelMapNames = self.map_names_class.items()
        instance_label_name = self.map_names_class[int(labelledInstance.y)]

        if reductionMethod == "pca" or reductionMethod == "mca":
            XX = np.vstack([X, istance_transformed])
            yy = np.concatenate((y, np.array([label_istance])))
            ax = fig2.add_subplot(1, 2, position, projection='3d')

            # ax = Axes3D(fig, rect=[0, 0, .7, 1], elev=48, azim=134)
            sc = ax.scatter(XX[:, 0],
                            XX[:, 1],
                            XX[:, 2],
                            c=yy,
                            cmap="Spectral",
                            edgecolor='k')
            ax.w_xaxis.set_ticklabels([])
            ax.w_yaxis.set_ticklabels([])
            ax.w_zaxis.set_ticklabels([])
            label_values = list(np.unique(y_l))
            label_values.append(int(label_istance))
        else:
            ax = fig2.add_subplot(1, 2, position)
            sc = ax.scatter(XX[:, 0], XX[:, 1], c=yy, cmap="tab10")
            ax.xaxis.set_ticklabels([])
            ax.yaxis.set_ticklabels([])
            label_values = list(np.unique(yy.astype(int)))

        colors = [sc.cmap(sc.norm(i)) for i in label_values]
        custom_lines = [
            plt.Line2D([], [],
                       ls="",
                       marker='.',
                       mec='k',
                       mfc=c,
                       mew=.1,
                       ms=20) for c in colors
        ]

        d2 = dict(labelMapNames)
        d2[int(label_istance)] = instance_label_name + "_i"
        labelMapNames_withInstance = d2.items()

        newdict = {
            k: dict(labelMapNames_withInstance)[k]
            for k in label_values
        }

        ax.legend(custom_lines, [lt[1] for lt in newdict.items()],
                  loc='center left',
                  bbox_to_anchor=(0.9, .5),
                  fontsize='x-small')

        return fig2
Example #25
 def test_explained_inertia(self):
     mca = prince.MCA(n_components=4, random_state=42)
     mca.fit(self.X)
     self.assertEqual(mca.J, 22)
     np.testing.assert_allclose(mca.explained_inertia_, [.9519, .0168, .0004, 0], atol=0.0001)
Example #26
    def showNearestNeigh_type_2(self,
                                Sn_inst,
                                fig2,
                                position,
                                reductionMethod="pca",
                                training=False):

        from sklearn import decomposition

        count_inst = self.explain_indices.index(Sn_inst)
        n_inst = int(Sn_inst)
        # Plot it with a different colour
        instTmp2 = Orange.data.Instance(self.explain_dataset.domain,
                                        self.explain_dataset[count_inst])
        c = self.classifier(instTmp2, False)
        small_dataset_len = 150
        if self.training_dataset_len < small_dataset_len:
            self.starting_K = max(
                int(self.mappa_class[self.map_names_class[c[0]]] *
                    self.training_dataset_len), self.K)
        if training == True:
            Kneighbors_data, removeToDo = genNeighborsInfoTraining(
                self.training_dataset, self.NearestNeighborsAll,
                self.explain_dataset.X[count_inst], n_inst, self.starting_K,
                self.unique_filename, self.classifier)
        else:
            Kneighbors_data, removeToDo = gen_neighbors_info(
                self.training_dataset,
                self.NearestNeighborsAll,
                self.explain_dataset[count_inst],
                self.starting_K,
                self.unique_filename,
                self.classifier,
                save=False)

        X = Kneighbors_data.X
        y = Kneighbors_data.Y
        labelledInstance = deepcopy(instTmp2)

        if reductionMethod == "pca":
            pca = decomposition.PCA(n_components=3)
            pca.fit(X)
            X = pca.transform(X)
            istance_transformed = pca.transform([labelledInstance.x])

        elif reductionMethod == "mca":
            import pandas as pd
            import prince

            dataK = []
            for k in range(0, len(Kneighbors_data)):
                dataK.append(Kneighbors_data[k].list)

            columnsA = [i.name for i in Kneighbors_data.domain.variables]

            if Kneighbors_data.domain.metas != ():
                for i in range(0, len(Kneighbors_data.domain.metas)):
                    columnsA.append(Kneighbors_data.domain.metas[i].name)
            data = pd.DataFrame(data=dataK, columns=columnsA)

            columnsA = [i.name for i in Kneighbors_data.domain.attributes]
            Xa = data[columnsA]
            y = Kneighbors_data.Y

            mca = prince.MCA(n_components=3,
                             n_iter=3,
                             copy=True,
                             check_input=True,
                             engine='auto',
                             random_state=42)
            mca.fit(Xa)
            X = mca.transform(Xa)
            istance_transformed = mca.transform([[
                labelledInstance[i].value
                for i in labelledInstance.domain.attributes
            ]])

        elif reductionMethod == "t-sne":
            from sklearn.manifold import TSNE
            XX = np.vstack([X, labelledInstance.x])
            label_istance = float(max(list(self.map_names_class.keys())) + 1)
            yy = np.concatenate((y, np.array([label_istance])))
            tsne = TSNE(n_components=2, random_state=0)
            tsne.fit(XX)
            XX = tsne.fit_transform(XX)

        else:
            print("Reduction method available: pca, t-sne, selected",
                  reductionMethod)
        label_istance = float(max(list(self.map_names_class.keys())) + 1)
        y_l = y.astype(int)
        labelMapNames = self.map_names_class.items()
        instance_label_name = self.map_names_class[int(labelledInstance.y)]

        if reductionMethod == "pca" or reductionMethod == "mca":
            XX = np.vstack([X, istance_transformed])
            yy = np.concatenate((y, np.array([label_istance])))
            ax = fig2.add_subplot(1, 2, position, projection='3d')

            # ax = Axes3D(fig, rect=[0, 0, .7, 1], elev=48, azim=134)
            sc = ax.scatter(XX[:, 0],
                            XX[:, 1],
                            XX[:, 2],
                            c=yy,
                            cmap="Spectral",
                            edgecolor='k')
            ax.w_xaxis.set_ticklabels([])
            ax.w_yaxis.set_ticklabels([])
            ax.w_zaxis.set_ticklabels([])
            label_values = list(np.unique(y_l))
            label_values.append(int(label_istance))
            ax.set_title(self.classifier_name.upper())

        else:
            ax = fig2.add_subplot(1, 2, position)
            sc = ax.scatter(XX[:, 0], XX[:, 1], c=yy, cmap="tab10")
            ax.xaxis.set_ticklabels([])
            ax.yaxis.set_ticklabels([])
            label_values = list(np.unique(yy.astype(int)))
            ax.set_title(self.classifier_name.upper())

        colors = [sc.cmap(sc.norm(i)) for i in label_values]

        d2 = dict(labelMapNames)
        d2[int(label_istance)] = instance_label_name + "_i"
        labelMapNames_withInstance = d2.items()

        newdict = {
            k: dict(labelMapNames_withInstance)[k]
            for k in label_values
        }

        # ax.legend(custom_lines, [lt[1] for lt in newdict.items()],
        #          loc='center left', bbox_to_anchor=(0.9, .5), fontsize = 'x-small')

        return fig2, newdict, colors
Example #27
def dim_reduce_init(y,
                    n_clusters,
                    k,
                    r,
                    nj,
                    var_distrib,
                    use_famd=False,
                    seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and determine 
    the init coefficients in that space
    
    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (int): The dimension of latent variables
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y 
    use_famd (Bool): Whether to use the FAMD method (True) or not (False) to initialise the
                    first continuous latent variable; otherwise MCA is used.
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''

    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    #==============================================================
    # Dimension reduction performed with MCA
    #==============================================================

    if type(y) != pd.core.frame.DataFrame:
        raise TypeError('y should be a dataframe for prince')

    if (np.array(var_distrib) == 'ordinal').all():
        print('PCA init')

        pca = prince.PCA(n_components = r[0], n_iter=3, rescale_with_mean=True,\
            rescale_with_std=True, copy=True, check_input=True, engine='auto',\
                random_state = seed)
        z1 = pca.fit_transform(y).values

    elif use_famd:
        famd = prince.FAMD(n_components = r[0], n_iter=3, copy=True, check_input=False, \
                               engine='auto', random_state = seed)
        z1 = famd.fit_transform(y).values

    else:
        # Check input = False to remove
        mca = prince.MCA(n_components = r[0], n_iter=3, copy=True,\
                         check_input=False, engine='auto', random_state = seed)
        z1 = mca.fit_transform(y).values

    z = [z1]
    y = y.values

    #==============================================================
    # Set the shape parameters of each data type
    #==============================================================

    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',\
                               var_distrib == 'binomial')].astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',\
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal'].astype(float).astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    # Scale the continuous variables to unit standard deviation
    y_cont = y[:, var_distrib == 'continuous']

    # Before was np.float
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)
    nb_cont = y_cont.shape[1]

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================
    init = {}

    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    for l in range(L):
        params = get_MFA_params(z[l], k[l], r[l:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, l] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check all paths have been explored
    if len(paths) != S:
        raise RuntimeError('Expected %s paths but only %s were initialised' % (S, len(paths)))

    w_s = w_s.reshape(*k).flatten('C')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi

    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering
    # i.e. the layer l such that k[l] == n_clusters
    clustering_layer = np.argmax(np.array(k) == n_clusters)

    init[
        'classes'] = paths_pred[:,
                                clustering_layer]  # 0 To change with clustering_layer_idx

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients.

    lambda_bin = np.zeros((nb_bin, r[0] + 1))

    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # If the variable is Bernoulli not binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # If not, need to convert Binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])

        lr = LogisticRegression()

        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate(
                [lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of bin coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        Nj = len(np.unique(
            y_ord[:, j], axis=0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(z[0], yj)

        ## Identifiability of ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables
    lambda_cont = np.zeros((nb_cont, r[0] + 1))

    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()

        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_],
                                                     linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])

    ## Identifiability of continuous coefficients
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)

        ## Identifiability of categ coefficients
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ

    return init
Example #28
    cons_ultim_12(data_out.estrato.values[i], data_out.unidades_ant.values[i])
    for i in range(len(data_out.estrato))
]
unidades_ultim_12 = np.reshape(unidades_ultim_12, (-1, 13))
unidades_ultim_12 = pd.DataFrame(
    unidades_ultim_12, columns=["mes_t-" + str(12 - i) for i in range(13)])

print("[INFO] Data chunk 2 ...done")

## convert estrato to its numeric value
data_out['estrato'] = data_out['estrato'].apply(lambda x: kwh_cost[x][1])
data_aux = data_out[['estrato', 'localidad', 'valor_ant']].copy()
data_aux.valor_ant = [(i - np.min(data_aux.valor_ant)) /
                      (np.max(data_aux.valor_ant) - np.min(data_aux.valor_ant))
                      for i in data_aux.valor_ant]
mca = pr.MCA(n_components=-1).fit_transform(data_aux.values)
gmm = GaussianMixture(n_components=4)
gmm.fit(mca)
labels = gmm.predict(mca)

### Predictive model

### function to measure performance


def metrics(real, pred):
    kappa = cohen_kappa_score(real, pred)
    acc = accuracy_score(real, pred)
    f1 = f1_score(real, pred)
    prec = precision_score(real, pred)
    recall = recall_score(real, pred)
Example #29
import pandas as pd
import matplotlib.pyplot as plt
import prince

#########################################################################################################
df_dataAquitaine = pd.read_csv("/home/alauzettho/BOAMP/DataScience/data.csv")
df_dataAquitaine = df_dataAquitaine.drop(columns='Unnamed: 0')
print('------------------- DATA AQUITAINE IMPORTED -------------------')
#########################################################################################################

df = df_dataAquitaine[[
    'CPV', 'CP', 'CLASSE_LIBELLE', 'CRITERES_ATTRIBUTION_1',
    'CRITERES_ATTRIBUTION_2'
]]
df = df.dropna()
print(df.shape)

mca = prince.MCA(n_components=2,
                 n_iter=3,
                 copy=True,
                 check_input=True,
                 engine='auto',
                 random_state=42)
mca = mca.fit(df)

ax = mca.plot_coordinates(X=df, ax=None, figsize=(6, 6),  \
            show_row_points=True, row_points_size=10, show_row_labels=False, show_column_points=True, \
            column_points_size=30, show_column_labels=False, legend_n_cols=1)

plt.show()
Example #30
 def test_column_contributions(self):
     mca = prince.MCA(n_components=4, random_state=42)
     mca.fit(self.X)
     c_cont = mca.column_contributions()
     pd.testing.assert_index_equal(c_cont.index, mca.col_masses_.index)
     np.testing.assert_allclose(c_cont.sum(axis=0), [1., 1., 1., 1.], atol=0.0001)
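
The invariant behind this last test: within each principal component, the contributions of all category levels sum to 1, i.e. each column of the contribution table is a distribution over levels. A small helper sketch for checking that property on any contributions table (the array layout is assumed to be levels x components):

import numpy as np

def contributions_sum_to_one(contrib, tol=1e-4):
    # contrib: (n_levels, n_components) array of MCA column contributions
    totals = np.asarray(contrib).sum(axis=0)
    return np.allclose(totals, 1.0, atol=tol)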