def perform_correspondence_analysis(data_frame):
    mca = prince.MCA(n_components=2, n_iter=3, copy=True, check_input=True,
                     engine='auto', random_state=42)
    ptumor_mca = mca.fit(data_frame)
    ax = ptumor_mca.plot_coordinates(
        X=data_frame, ax=None, figsize=(10, 10),
        show_row_points=False, row_points_size=0, show_row_labels=False,
        show_column_points=True, column_points_size=30,
        show_column_labels=True, legend_n_cols=1)
    # Move the legend outside the axes so it does not overlap the points.
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()
def do_mca(self):
    import prince
    x_train, x_test, y_train, y_test = self.scale_data()
    mca = prince.MCA(n_components=12, random_state=42)
    # Fit on the training set only, then project both sets with transform.
    x_train = mca.fit_transform(x_train)
    x_test = mca.transform(x_test)
    return x_train, x_test, y_train, y_test
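A minimal sketch of the same fit-on-train / transform-on-test pattern on toy data (column names and values are illustrative; prince one-hot encodes internally, so the test frame here is deliberately built to contain every training category so the dummy columns line up):

import pandas as pd
import prince

train = pd.DataFrame({
    'color': ['red', 'blue', 'red', 'blue', 'red', 'blue'],
    'size':  ['S', 'M', 'L', 'S', 'M', 'L'],
})
test = pd.DataFrame({
    'color': ['blue', 'red', 'blue'],
    'size':  ['S', 'M', 'L'],
})

mca = prince.MCA(n_components=2, random_state=42)
train_coords = mca.fit_transform(train)  # learn the category space from train
test_coords = mca.transform(test)        # project test into the same space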
def performMCA(state):
    global persondatadf, mca_res
    cat_data = persondatadf[['SCHL', 'RAC1P', 'ESR', 'ST']]
    if state > 0:
        cat_data = cat_data[cat_data['ST'] == state]
    cat_data = cat_data.sample(frac=0.1)
    cat_data.drop(columns="ST", inplace=True)
    # Map the coded census values to human-readable labels.
    cat_data['RAC1P'] = cat_data['RAC1P'].apply(lambda race: races.get(race))
    cat_data['SCHL'] = cat_data['SCHL'].apply(lambda deg: degrees.get(deg))
    cat_data['ESR'] = cat_data['ESR'].apply(lambda emp: employment.get(emp))
    cat_data = cat_data.fillna("Unknown")
    mca = prince.MCA(n_components=2, n_iter=3, copy=True, check_input=True,
                     engine='auto', random_state=42)
    mca = mca.fit(cat_data)
    mca_res = mca.column_coordinates(cat_data)
    mca_res.reset_index(inplace=True)
    types = {"SCHL": 1, "RAC1P": 2, "ESR": 3}
    # Column labels look like "SCHL_Bachelor"; split them into variable and category.
    mca_res['type'] = mca_res['index'].apply(lambda idx: types.get(idx.split('_')[0]))
    mca_res['index'] = mca_res['index'].apply(lambda idx: idx.split('_')[1])
    mca_res.rename(columns={0: 'x', 1: 'y', 'index': 'cat'}, inplace=True)
def generate_mca(df, save_path=None):
    """
    :param df: dataframe containing categorical variables (numerical ones are ignored)
    :param save_path: directory in which to save the figure
    :return: None
    """
    # Keep only the non-numeric (categorical) columns.
    cat_feature_list = [x for x in df.columns
                        if x not in df.select_dtypes(include='number').columns]
    df_cc_cat = df[cat_feature_list]
    mca = prince.MCA(n_components=2, n_iter=3, copy=True, check_input=True,
                     engine='auto', random_state=42)
    mca.fit(df_cc_cat.astype('category'))
    mca.plot_coordinates(X=df_cc_cat.astype('category'), ax=None, figsize=(7, 7),
                         show_row_points=False, row_points_size=30,
                         show_row_labels=False, show_column_points=True,
                         column_points_size=40, show_column_labels=True,
                         legend_n_cols=2)
    if save_path is not None:
        plt.savefig(Path(save_path) / 'mca.png')
    plt.close('all')
def mca(self, df, cols):
    """
    cols: list of categorical column names.
    Run a multiple correspondence analysis and plot the coordinates.
    """
    mca = prince.MCA(n_components=2, n_iter=3, copy=True, check_input=True,
                     engine='auto')
    mca.fit(df[cols])
    fig, ax = plt.subplots(figsize=(10, 10))
    mca.plot_coordinates(X=df[cols], ax=ax, show_row_points=True,
                         row_points_size=10, show_row_labels=False,
                         show_column_points=True, column_points_size=30,
                         show_column_labels=False, legend_n_cols=1)
    plt.show()
    print("Explained inertia:", mca.explained_inertia_)
    return mca
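Note that plot_coordinates and explained_inertia_ belong to the pre-0.8 prince API. A rough equivalent under the rewritten API, as an assumption based on prince >= 0.10 (where plotting moved to Altair), would be:

# Sketch for newer prince releases (assumes prince >= 0.10).
mca = prince.MCA(n_components=2, n_iter=3, random_state=42)
mca = mca.fit(df[cols])
print(mca.percentage_of_variance_)  # rough replacement for explained_inertia_
chart = mca.plot(df[cols])          # rough replacement for plot_coordinates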
def test_row_cos2(self):
    mca = prince.MCA(n_components=4, random_state=42)
    mca.fit(self.X)
    r_cos2 = mca.row_cos2()
    self.assertEqual(r_cos2.shape, (6, 4))
    pd.testing.assert_index_equal(r_cos2.index, mca.row_masses_.index)
    self.assertTrue(np.all((r_cos2 >= 0) & (r_cos2 <= 1)),
                    "All cos2 values should be between 0 and 1")
    self.assertTrue(np.all(r_cos2.sum(axis=1) <= 1),
                    "Cos2 summed across dimensions should not exceed 1")
def test_column_cos2(self):
    mca = prince.MCA(n_components=4, random_state=42)
    mca.fit(self.X)
    c_cos2 = mca.column_cos2()
    self.assertEqual(c_cos2.shape, (22, 4))
    pd.testing.assert_index_equal(c_cos2.index, mca.col_masses_.index)
    self.assertTrue(np.all((c_cos2 >= 0) & (c_cos2 <= 1)),
                    "All cos2 values should be between 0 and 1")
    # The sum should really be <= 1; the tolerance accounts for floating-point error.
    self.assertTrue(np.all(c_cos2.sum(axis=1) <= 1.000001),
                    "Cos2 summed across dimensions should not exceed 1")
def categorical_MCA(self, X, n_components=4, n_iter=1000):
    X = X.copy()
    cat_data, non_cat_data = self.categorize_data(X)
    mca = prince.MCA(n_components=n_components, n_iter=n_iter)
    cat_reduced = mca.fit_transform(cat_data)
    cat_reduced.columns = ["MCA_{}".format(i + 1)
                           for i in range(len(cat_reduced.columns))]
    return pd.concat([cat_reduced, non_cat_data], axis=1)
def prince_mca(X, facet1n2_sample_ids, facet1_sample_ids, facet2_sample_ids,
               pairID, unique_samples, unique_facets):
    facet1_trunc = facet1[0:15]
    facet2_trunc = facet2[0:15]
    outF.write('Started prince_mca module...' + '\n')
    df = pd.DataFrame(data=X, index=unique_samples, columns=unique_facets)
    # Old prince (< 0.3) API: the data is passed to the constructor and the
    # coordinates are exposed as attributes.
    mca = prince.MCA(df, n_components=2)
    row_principal_coordinates = mca.row_principal_coordinates
    row_principal_coordinates.columns = ['x', 'y']
    # Adding info to the dataframe for graph drawing.
    row_principal_coordinates.index.name = 'id'
    row_principal_coordinates['Attribute'] = 0
    row_principal_coordinates.loc[
        row_principal_coordinates.index.isin(facet1_sample_ids), 'Attribute'] = facet1
    row_principal_coordinates.loc[
        row_principal_coordinates.index.isin(facet2_sample_ids), 'Attribute'] = facet2
    row_principal_coordinates.loc[
        row_principal_coordinates.index.isin(facet1n2_sample_ids), 'Attribute'] = 'both'
    # Adds a column describing spot frequency for depth visualisation:
    # row_principal_coordinates['s'] = row_principal_coordinates.groupby(['x', 'y']).transform('count')
    # Debugging stop: inspect the spot frequencies, then abort. Remove these
    # two lines to run the full function.
    print(row_principal_coordinates.groupby(['x', 'y']).transform('count'))
    sys.exit()
    mcaname = 'data/plots/mca_' + str(pairID) + '.dat'
    row_principal_coordinates.to_csv(mcaname)
    # (A commented-out 3D variant using n_components=3, written for
    # mancluster.py and generally turned off there, was left out here; it
    # occasionally failed with an unexplained ValueError.)
    return (row_principal_coordinates, mcaname)
def MCA(X):
    # X = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data')
    # X.columns = ['Color', 'Size', 'Action', 'Age', 'Inflated']
    print(X.head())
    mca = prince.MCA()
    mca = mca.fit(X)        # same as calling ca.fs_r(1)
    mca = mca.transform(X)  # same as calling ca.fs_r_sup(df_new) for *another* test set.
    return mca
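Since transform on a fitted prince MCA returns the row principal coordinates as a DataFrame, the repeated rebinding of mca above ends with coordinates rather than a fitted model. A sketch of an equivalent version with distinct names, assuming a prince version that exposes row_coordinates:

def mca_row_coordinates(X, n_components=2):
    """Fit an MCA on X and return its row principal coordinates."""
    model = prince.MCA(n_components=n_components).fit(X)
    return model.row_coordinates(X)  # same values as model.transform(X)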
def dimension_reduction_pca_mca(self, df):
    # MCA - categorical features
    print("--------------------------")
    print(df.head())
    cat_cols = []
    if self.one_hot is True:
        cat_cols = self.encoders.get_feature_names(
            self.categorical_features_initial) + ["exist_closed"]
    else:
        cat_cols = self.categorical_features_final
    print(cat_cols)
    # If my_mca is None, this is df_train, so we fit my_mca.
    n_comp = len(cat_cols)
    if self.my_mca is None:
        self.my_mca = prince.MCA(n_components=n_comp)
        self.my_mca.fit(df[cat_cols])
    cols = ["MCA_" + str(i + 1) for i in range(n_comp)]
    print(df[cat_cols].shape)
    print(self.my_mca)
    aux_mca = self.my_mca.transform(df[cat_cols])
    aux_mca.columns = cols
    df = df.drop(cat_cols, axis=1)
    df = df.join(aux_mca)

    # PCA - numerical features
    # If my_pca is None, this is df_train, so we fit my_pca.
    num_cols = self.numerical_features_final
    n_comp = len(num_cols)
    print(num_cols)
    if self.my_pca is None:
        self.my_pca = PCA(n_components=n_comp)
        self.my_pca.fit(df[num_cols])
    cols = ["PCA_" + str(i + 1) for i in range(n_comp)]
    print(df.head())
    print(num_cols)
    # Keep the original index so the join below aligns rows correctly.
    aux_pca = pd.DataFrame(self.my_pca.transform(df[num_cols]),
                           columns=cols, index=df.index)
    df = df.drop(num_cols, axis=1)
    df = df.join(aux_pca)
    print(self.my_pca.explained_variance_ratio_)
    print(df.head())
    return df
def MCAX(data, cat_columns):
    """
    cat_columns: list of categorical column names.
    """
    X = data[cat_columns]
    print(X.head())
    mca = prince.MCA()
    mca = mca.fit(X)        # same as calling ca.fs_r(1)
    mca = mca.transform(X)  # same as calling ca.fs_r_sup(df_new) for *another* test set.
    print(mca)
    return mca

def OneHotEncode(self, data, column):
    encoder = ce.OneHotEncoder(cols=[column], return_df=True)
    return encoder.fit_transform(data)

# High cardinality of features
def FeatureHasher(self, data, column, components):
    encoder = ce.HashingEncoder(cols=[column], n_components=components)
    return encoder.fit_transform(data)

def LabelEncoding(self, data, column):
    encoder = ce.OrdinalEncoder(cols=[column], return_df=True)
    return encoder.fit_transform(data)

# High cardinality of features
def BinaryEncoder(self, data, column):
    encoder = ce.BinaryEncoder(cols=[column], return_df=True)
    return encoder.fit_transform(data)

def feature_encode(self, data):
    encoded = []
    for col in data.columns:
        n = data.groupby([col])
        if n.ngroups > 10000:
            continue
        if 1000 < n.ngroups < 10000:
            encoded.append(self.BinaryEncoder(data, col))
            continue
        if 10 < n.ngroups < 1000:
            encoded.append(self.FeatureHasher(data, col, 15))
            continue
def Type_Conversion(Stats_Master):
    PCA_Master = Stats_Master.copy()
    PCA_Type = PCA_Master[['Type_1', 'Type_2']].fillna(value='None')
    mca = prince.MCA(n_components=15, n_iter=100, copy=False, engine='auto',
                     random_state=42)
    Typed_MCA = mca.fit(PCA_Type)
    print(np.sum(Typed_MCA.explained_inertia_))
    # U_ holds the left singular vectors computed during the fit.
    mca_Components = Typed_MCA.U_
    df = pd.DataFrame(mca_Components)
    return df
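U_ is an internal attribute; a sketch of a public-API alternative, assuming what is wanted downstream are the row coordinates (which are mass- and singular-value-scaled versions of U_, not the identical values):

# Public-API alternative: row principal coordinates instead of raw U_.
df = Typed_MCA.row_coordinates(PCA_Type).reset_index(drop=True)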
def do_preprocess_3(df, target_col_name, plots_path, suffix):
    # ------------------------------------------------------------------------ #
    # Pre-process the data with a second dimensionality-reduction approach:
    # find the similarity between the various levels with a clustering
    # technique and use the cluster numbers instead of the actual levels.
    # ------------------------------------------------------------------------ #
    # Drop a few columns as they should not have any influence on the target.
    df = drop_cols(df, ["userid", "doctorid", "transdate"])
    logging.info(df.describe())
    logging.info(df.dtypes)
    # Since all the columns are categorical attributes, convert them to the
    # appropriate categorical dtype.
    for col in df.columns:
        df[col] = df[col].astype('category')
    logging.info('Dimensionality Reduction with MCA')
    # Old prince (< 0.3) API: the dataframe is passed to the constructor.
    mca = prince.MCA(df, n_components=1100)
    print('MCA is :: ', mca)
    logging.info('eigenvalues are :: ' + str(mca.eigenvalues))
    logging.info('column_correlations are :: ' + str(mca.column_correlations))
    logging.info('cumulative_explained_inertia are :: ' + str(mca.cumulative_explained_inertia))
    logging.info('explained_inertia are :: ' + str(mca.explained_inertia))
    logging.info('row_cosine_similarities are :: ' + str(mca.row_cosine_similarities))
    logging.info('row_principal_coordinates are :: ' + str(mca.row_principal_coordinates))
    # mca.plot_rows(show_points=True, show_labels=False, ellipse_fill=True)
    # mca.plot_relationship_square()
    mca.plot_cumulative_inertia(threshold=0.8)
    # plt.savefig(str(plots_path) + 'MCA_Analysis_Cumulative_Inertia_' + suffix + '.png')
    print(df.head())
    logging.info("Pre-processed data frame :: ")
    logging.info(df.describe())
    logging.info(df.dtypes)
def do_MCA(X, n_components=10):
    '''
    Performs multiple correspondence analysis on X.
    '''
    warnings.filterwarnings("ignore", category=FutureWarning)
    # Run the MCA using prince.
    mca = prince.MCA(n_components=n_components)
    mca = mca.fit(X)
    # Individual (row) loadings onto the components.
    mca.ind_scores = mca.row_coordinates(X).values
    # Edge (column) loadings onto the components.
    edge_scores = mca.column_coordinates(X).values
    # Exclude every other row (the zero loadings).
    mca.edge_scores = edge_scores[1::2, :]
    return mca
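A label-based alternative to the positional [1::2] slice above; a sketch, assuming X holds binary variables so that prince's internal get_dummies names the category columns "<variable>_0" / "<variable>_1":

col_coords = mca.column_coordinates(X)
# Keep only the "present" (value == 1) categories, selected by label rather
# than by position, so the result does not depend on column ordering.
edge_scores = col_coords[col_coords.index.str.endswith('_1')].values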
def SampleAndMCA():
    FIELDS = {
        'Council District': True,
        'Report Day': True,
        'Clearance Day': True,
        'Highest NIBRS/UCR Offense Description': True,
        'GO Location Zip': True,
        '_id': False,
        'Clearance Status': True,
    }
    connection = MongoClient(MONGODB_HOST, MONGODB_PORT)
    collection = connection[DBS_NAME][COLLECTION_NAME]
    projects = collection.find(projection=FIELDS)
    # Reservoir sampling: keep a uniform random sample of `limit` documents.
    sample = []
    cnt = 0
    limit = 10000
    for project in projects:
        if cnt < limit:
            sample.append(project)
        else:
            idx = random.randint(0, cnt)  # randint is inclusive on both ends
            if idx < limit:
                sample[idx] = project
        cnt = cnt + 1
    df = pd.DataFrame(sample)
    # Old prince (< 0.3) API: the dataframe is passed to the constructor.
    mca = prince.MCA(df, n_components=2)
    fig1, ax1 = mca.plot_cumulative_inertia()
    fig3, ax3 = mca.plot_rows_columns()
    fig4, ax4 = mca.plot_relationship_square()
    plt.show()
    return json.dumps("Sampling and Multiple Correspondence Analysis done")
def test_plot_show_column_labels(self):
    mca = prince.MCA(n_components=2)
    mca.fit(self.X)
    ax = mca.plot_coordinates(self.X, show_column_labels=True)
    self.assertTrue(isinstance(ax, mpl.axes.Axes))
def test_pandas_dataframe(self):
    mca = prince.MCA(n_components=2)
    self.assertTrue(isinstance(mca.fit(self.X), prince.MCA))
    self.assertTrue(isinstance(mca.transform(self.X), pd.DataFrame))
def test_numpy_array(self):
    mca = prince.MCA(n_components=2)
    self.assertTrue(isinstance(mca.fit(self.X.to_numpy()), prince.MCA))
    self.assertTrue(isinstance(mca.transform(self.X.to_numpy()), pd.DataFrame))
df = pd.read_csv('data/datalab_persona_run1_with_scale_cat_classless.csv')
df2 = pd.read_csv('data/datalab_persona_run1_with_scale_cat.csv')
df_class = df2['class'].values
cols = ['g' if x == 'smoker' else 'b' for x in df_class]
# df = pd.read_csv('data/ogm.csv')
# cols = [x for x in df2.columns.values if
#         x not in ['Age Next at DOC', 'Height', 'Weight', 'Annual Salary', 'Travel %']]
# df = pd.get_dummies(df)

# Old prince (< 0.3) API: n_components=-1 keeps all components.
mca = prince.MCA(df, n_components=-1)
# Set the axes you want to examine below, i.e. which component pair you are
# interested in - (0, 1).
vals = mca.row_principal_coordinates
print(len(vals))
vals = vals.values
plt.scatter(vals[:, 0], vals[:, 1], c=cols)

mca = prince.MCA(df2, n_components=-1)
def test_eigenvalues_are_corrected(self):
    mca = prince.MCA(n_components=4, random_state=42)
    mca.fit(self.X)
    self.assertEqual(mca.K, 10)
    np.testing.assert_allclose(mca.eigenvalues_,
                               [.7004, .0123, .0003, 0],
                               atol=0.0001)
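The "corrected" eigenvalues this test expects are consistent with the Benzécri correction, which rescales the raw indicator-matrix eigenvalues to compensate for the inflation MCA introduces. A standalone sketch of that formula (an assumption about what the library computes, not a quote of its internals), where K is the number of true variables:

import numpy as np

def benzecri_correction(eigenvalues, K):
    """Benzécri-corrected eigenvalues: zero out eigenvalues <= 1/K and
    rescale the rest as ((K / (K - 1)) * (eig - 1/K)) ** 2."""
    eig = np.asarray(eigenvalues, dtype=float)
    return np.where(eig > 1 / K, (K / (K - 1) * (eig - 1 / K)) ** 2, 0.0)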
    tend_doy = pd.to_datetime(row['tend']).dayofyear
    # Calculate the median DOY.
    doy_mean = int((tstart_doy + tend_doy) / 2)
    doy_mean_year.append(doy_mean)

doy_mean_year = np.asarray(doy_mean_year)
doy_mean_year = doy_mean_year.reshape(-1, 1)

# Encoding mean DOY with detection.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(doy_mean_year)
doy_mean_enc = enc.transform(doy_mean_year).toarray()

# MCA of detected DOYs.
mca = prince.MCA()
mca = mca.fit(doy_mean_enc)
mca = mca.transform(doy_mean_enc)
mca_doy = np.array(mca)

####################################################################
# Stacking of the variables of each feature:
#  - number of detections
#  - avg intensity
#  - start month of detection
stack = np.column_stack((det_int, mca_doy))

# Principal Component Analysis (PCA):
# reducing the dimensions of stack from four to two.
pca = PCA(n_components=2)
principal_comp = pca.fit_transform(stack)
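prince expects raw categorical columns and performs the one-hot encoding itself (its handling of already-encoded numeric input varies by version), so a more direct route is to hand it the categories. A sketch, reusing doy_mean_year from above and treating the mean DOY itself as a single categorical variable (the column name is illustrative):

import pandas as pd
import prince

# Hypothetical direct route: one categorical column of DOY values, encoded
# internally by prince rather than by a separate OneHotEncoder step.
doy_df = pd.DataFrame({'doy_mean': doy_mean_year.ravel().astype(str)})
mca = prince.MCA(n_components=2)
mca_doy = mca.fit_transform(doy_df).to_numpy()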
def test_total_inertia(self):
    mca = prince.MCA(n_components=4, random_state=42)
    mca.fit(self.X)
    np.testing.assert_almost_equal(mca.total_inertia_, 0.7130, 4)
def visualizePoints_comparison(self, Sn_inst, datapoints, fig2, position,
                               reductionMethod="pca", training=False):
    from sklearn import decomposition
    count_inst = self.explain_indices.index(Sn_inst)
    n_inst = int(Sn_inst)
    instTmp2 = Orange.data.Instance(self.explain_dataset.domain,
                                    self.explain_dataset[count_inst])
    c = self.classifier(instTmp2, False)
    labelledInstance = deepcopy(instTmp2)
    X = datapoints.X
    y = datapoints.Y
    if reductionMethod == "pca":
        pca = decomposition.PCA(n_components=3)
        pca.fit(X)
        X = pca.transform(X)
        istance_transformed = pca.transform([labelledInstance.x])
    elif reductionMethod == "mca":
        import pandas as pd
        import prince
        dataK = []
        for k in range(0, len(datapoints)):
            dataK.append(datapoints[k].list)
        columnsA = [i.name for i in datapoints.domain.variables]
        if datapoints.domain.metas != ():
            for i in range(0, len(datapoints.domain.metas)):
                columnsA.append(datapoints.domain.metas[i].name)
        data = pd.DataFrame(data=dataK, columns=columnsA)
        columnsA = [i.name for i in datapoints.domain.attributes]
        Xa = data[columnsA]
        y = datapoints.Y
        mca = prince.MCA(n_components=3, n_iter=3, copy=True,
                         check_input=True, engine='auto', random_state=42)
        mca.fit(Xa)
        X = mca.transform(Xa)
        istance_transformed = mca.transform([[
            labelledInstance[i].value
            for i in labelledInstance.domain.attributes
        ]])
    elif reductionMethod == "t-sne":
        from sklearn.manifold import TSNE
        XX = np.vstack([X, labelledInstance.x])
        label_istance = float(max(list(self.map_names_class.keys())) + 1)
        yy = np.concatenate((y, np.array([label_istance])))
        tsne = TSNE(n_components=2, random_state=0)
        XX = tsne.fit_transform(XX)
    else:
        print("Reduction methods available: pca, mca, t-sne; selected",
              reductionMethod)
    label_istance = float(max(list(self.map_names_class.keys())) + 1)
    y_l = y.astype(int)
    labelMapNames = self.map_names_class.items()
    instance_label_name = self.map_names_class[int(labelledInstance.y)]
    if reductionMethod == "pca" or reductionMethod == "mca":
        XX = np.vstack([X, istance_transformed])
        yy = np.concatenate((y, np.array([label_istance])))
        ax = fig2.add_subplot(1, 2, position, projection='3d')
        # ax = Axes3D(fig, rect=[0, 0, .7, 1], elev=48, azim=134)
        sc = ax.scatter(XX[:, 0], XX[:, 1], XX[:, 2], c=yy, cmap="Spectral",
                        edgecolor='k')
        ax.w_xaxis.set_ticklabels([])
        ax.w_yaxis.set_ticklabels([])
        ax.w_zaxis.set_ticklabels([])
        label_values = list(np.unique(y_l))
        label_values.append(int(label_istance))
    else:
        ax = fig2.add_subplot(1, 2, position)
        sc = ax.scatter(XX[:, 0], XX[:, 1], c=yy, cmap="tab10")
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
        label_values = list(np.unique(yy.astype(int)))
    colors = [sc.cmap(sc.norm(i)) for i in label_values]
    custom_lines = [
        plt.Line2D([], [], ls="", marker='.', mec='k', mfc=c, mew=.1, ms=20)
        for c in colors
    ]
    d2 = dict(labelMapNames)
    d2[int(label_istance)] = instance_label_name + "_i"
    labelMapNames_withInstance = d2.items()
    newdict = {k: dict(labelMapNames_withInstance)[k] for k in label_values}
    ax.legend(custom_lines, [lt[1] for lt in newdict.items()],
              loc='center left', bbox_to_anchor=(0.9, .5), fontsize='x-small')
    return fig2
def test_explained_inertia(self):
    mca = prince.MCA(n_components=4, random_state=42)
    mca.fit(self.X)
    self.assertEqual(mca.J, 22)
    np.testing.assert_allclose(mca.explained_inertia_,
                               [.9519, .0168, .0004, 0],
                               atol=0.0001)
def showNearestNeigh_type_2(self, Sn_inst, fig2, position,
                            reductionMethod="pca", training=False):
    from sklearn import decomposition
    count_inst = self.explain_indices.index(Sn_inst)
    n_inst = int(Sn_inst)
    # Plot it with a different colour.
    instTmp2 = Orange.data.Instance(self.explain_dataset.domain,
                                    self.explain_dataset[count_inst])
    c = self.classifier(instTmp2, False)
    small_dataset_len = 150
    if self.training_dataset_len < small_dataset_len:
        self.starting_K = max(
            int(self.mappa_class[self.map_names_class[c[0]]] *
                self.training_dataset_len), self.K)
    if training == True:
        Kneighbors_data, removeToDo = genNeighborsInfoTraining(
            self.training_dataset, self.NearestNeighborsAll,
            self.explain_dataset.X[count_inst], n_inst, self.starting_K,
            self.unique_filename, self.classifier)
    else:
        Kneighbors_data, removeToDo = gen_neighbors_info(
            self.training_dataset, self.NearestNeighborsAll,
            self.explain_dataset[count_inst], self.starting_K,
            self.unique_filename, self.classifier, save=False)
    X = Kneighbors_data.X
    y = Kneighbors_data.Y
    labelledInstance = deepcopy(instTmp2)
    if reductionMethod == "pca":
        pca = decomposition.PCA(n_components=3)
        pca.fit(X)
        X = pca.transform(X)
        istance_transformed = pca.transform([labelledInstance.x])
    elif reductionMethod == "mca":
        import pandas as pd
        import prince
        dataK = []
        for k in range(0, len(Kneighbors_data)):
            dataK.append(Kneighbors_data[k].list)
        columnsA = [i.name for i in Kneighbors_data.domain.variables]
        if Kneighbors_data.domain.metas != ():
            for i in range(0, len(Kneighbors_data.domain.metas)):
                columnsA.append(Kneighbors_data.domain.metas[i].name)
        data = pd.DataFrame(data=dataK, columns=columnsA)
        columnsA = [i.name for i in Kneighbors_data.domain.attributes]
        Xa = data[columnsA]
        y = Kneighbors_data.Y
        mca = prince.MCA(n_components=3, n_iter=3, copy=True,
                         check_input=True, engine='auto', random_state=42)
        mca.fit(Xa)
        X = mca.transform(Xa)
        istance_transformed = mca.transform([[
            labelledInstance[i].value
            for i in labelledInstance.domain.attributes
        ]])
    elif reductionMethod == "t-sne":
        from sklearn.manifold import TSNE
        XX = np.vstack([X, labelledInstance.x])
        label_istance = float(max(list(self.map_names_class.keys())) + 1)
        yy = np.concatenate((y, np.array([label_istance])))
        tsne = TSNE(n_components=2, random_state=0)
        XX = tsne.fit_transform(XX)
    else:
        print("Reduction methods available: pca, mca, t-sne; selected",
              reductionMethod)
    label_istance = float(max(list(self.map_names_class.keys())) + 1)
    y_l = y.astype(int)
    labelMapNames = self.map_names_class.items()
    instance_label_name = self.map_names_class[int(labelledInstance.y)]
    if reductionMethod == "pca" or reductionMethod == "mca":
        XX = np.vstack([X, istance_transformed])
        yy = np.concatenate((y, np.array([label_istance])))
        ax = fig2.add_subplot(1, 2, position, projection='3d')
        sc = ax.scatter(XX[:, 0], XX[:, 1], XX[:, 2], c=yy, cmap="Spectral",
                        edgecolor='k')
        ax.w_xaxis.set_ticklabels([])
        ax.w_yaxis.set_ticklabels([])
        ax.w_zaxis.set_ticklabels([])
        label_values = list(np.unique(y_l))
        label_values.append(int(label_istance))
        ax.set_title(self.classifier_name.upper())
    else:
        ax = fig2.add_subplot(1, 2, position)
        sc = ax.scatter(XX[:, 0], XX[:, 1], c=yy, cmap="tab10")
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
        label_values = list(np.unique(yy.astype(int)))
        ax.set_title(self.classifier_name.upper())
    colors = [sc.cmap(sc.norm(i)) for i in label_values]
    d2 = dict(labelMapNames)
    d2[int(label_istance)] = instance_label_name + "_i"
    labelMapNames_withInstance = d2.items()
    newdict = {k: dict(labelMapNames_withInstance)[k] for k in label_values}
    # ax.legend(custom_lines, [lt[1] for lt in newdict.items()],
    #           loc='center left', bbox_to_anchor=(0.9, .5), fontsize='x-small')
    return fig2, newdict, colors
def dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, use_famd=False,
                    seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and
    determine the init coefficients in that space.

    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (int): The dimension of latent variables
    nj (p 1darray): For binary/count data: the maximum values that the variable
        can take. For ordinal data: the number of different existing categories
        for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y
    use_famd (bool): Whether to use the FAMD method (True) or not (False) to
        initiate the first continuous latent variable. Otherwise MCA is used.
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''
    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    # ==============================================================
    # Dimension reduction performed with MCA
    # ==============================================================
    if type(y) != pd.core.frame.DataFrame:
        raise TypeError('y should be a dataframe for prince')

    if (np.array(var_distrib) == 'ordinal').all():
        print('PCA init')
        pca = prince.PCA(n_components=r[0], n_iter=3, rescale_with_mean=True,
                         rescale_with_std=True, copy=True, check_input=True,
                         engine='auto', random_state=seed)
        z1 = pca.fit_transform(y).values
    elif use_famd:
        famd = prince.FAMD(n_components=r[0], n_iter=3, copy=True,
                           check_input=False, engine='auto', random_state=seed)
        z1 = famd.fit_transform(y).values
    else:
        # check_input=False skips prince's input validation (TODO: remove?).
        mca = prince.MCA(n_components=r[0], n_iter=3, copy=True,
                         check_input=False, engine='auto', random_state=seed)
        z1 = mca.fit_transform(y).values

    z = [z1]
    y = y.values

    # ==============================================================
    # Set the shape parameters of each data type
    # ==============================================================
    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',
                               var_distrib == 'binomial')].astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal'].astype(float).astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    # Set y_count standard error to 1.
    y_cont = y[:, var_distrib == 'continuous']
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)  # was np.float
    nb_cont = y_cont.shape[1]

    # =======================================================
    # Determining the Gaussian Parameters
    # =======================================================
    init = {}
    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    for l in range(L):
        params = get_MFA_params(z[l], k[l], r[l:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, l] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check that all paths have been explored.
    if len(paths) != S:
        raise RuntimeError('Real path length is ' + str(S) +
                           ' while the initial number of paths was only ' +
                           str(len(paths)))

    w_s = w_s.reshape(*k).flatten('C')

    # =============================================================
    # Enforcing identifiability constraints over the first layer
    # =============================================================
    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi
    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering,
    # i.e. the layer l such that k[l] == n_clusters.
    clustering_layer = np.argmax(np.array(k) == n_clusters)
    init['classes'] = paths_pred[:, clustering_layer]  # 0: to change with clustering_layer_idx

    # =======================================================
    # Determining the coefficients of the GLLVM layer
    # =======================================================
    # Determining lambda_bin coefficients.
    lambda_bin = np.zeros((nb_bin, r[0] + 1))
    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]
        if Nj == 1:  # If the variable is Bernoulli, not binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # If not, need to convert the binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])
        lr = LogisticRegression()
        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate([lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])
    # Identifiability of bin coefficients.
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients.
    lambda_ord = []
    for j in range(nb_ord):
        Nj = len(np.unique(y_ord[:, j], axis=0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]
        ol = OrderedLogit()
        ol.fit(z[0], yj)
        # Identifiability of ordinal coefficients.
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables.
    lambda_cont = np.zeros((nb_cont, r[0] + 1))
    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()
        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_], linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])
    # Identifiability of continuous coefficients.
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients.
    lambda_categ = []
    for j in range(nb_categ):
        yj = y_categ[:, j]
        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)
        # Identifiability of categ coefficients.
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ
    return init
    cons_ultim_12(data_out.estrato.values[i], data_out.unidades_ant.values[i])
    for i in range(len(data_out.estrato))
]
unidades_ultim_12 = np.reshape(unidades_ultim_12, ((-1, 13)))
unidades_ultim_12 = pd.DataFrame(
    unidades_ultim_12, columns=["mes_t-" + str(12 - i) for i in range(13)])
print("[INFO] Data chunk 2 ...done")

## Convert estrato (socioeconomic stratum) to a numeric value.
data_out['estrato'] = data_out['estrato'].apply(lambda x: kwh_cost[x][1])
data_aux = data_out[['estrato', 'localidad', 'valor_ant']].copy()
# Min-max normalise valor_ant.
data_aux.valor_ant = [(i - np.min(data_aux.valor_ant)) /
                      (np.max(data_aux.valor_ant) - np.min(data_aux.valor_ant))
                      for i in data_aux.valor_ant]

mca = pr.MCA(n_components=-1).fit_transform(data_aux.values)
gmm = GaussianMixture(n_components=4)
gmm.fit(mca)
labels = gmm.predict(mca)

### Predictive model

### Function to measure performance
def metrics(real, pred):
    kappa = cohen_kappa_score(real, pred)
    acc = accuracy_score(real, pred)
    f1 = f1_score(real, pred)
    prec = precision_score(real, pred)
    recall = recall_score(real, pred)
import pandas as pd
import matplotlib.pyplot as plt
import prince

#########################################################################################################

df_dataAquitaine = pd.read_csv("/home/alauzettho/BOAMP/DataScience/data.csv")
df_dataAquitaine = df_dataAquitaine.drop(columns='Unnamed: 0')
print('------------------- DATA AQUITAINE IMPORTED -------------------')

#########################################################################################################

df = df_dataAquitaine[[
    'CPV', 'CP', 'CLASSE_LIBELLE', 'CRITERES_ATTRIBUTION_1',
    'CRITERES_ATTRIBUTION_2'
]]
df = df.dropna()
print(df.shape)

mca = prince.MCA(n_components=2, n_iter=3, copy=True, check_input=True,
                 engine='auto', random_state=42)
mca = mca.fit(df)

ax = mca.plot_coordinates(X=df, ax=None, figsize=(6, 6),
                          show_row_points=True, row_points_size=10,
                          show_row_labels=False, show_column_points=True,
                          column_points_size=30, show_column_labels=False,
                          legend_n_cols=1)
plt.show()
def test_column_contributions(self):
    mca = prince.MCA(n_components=4, random_state=42)
    mca.fit(self.X)
    c_cont = mca.column_contributions()
    pd.testing.assert_index_equal(c_cont.index, mca.col_masses_.index)
    # Contributions within each component should sum to 1.
    np.testing.assert_allclose(c_cont.sum(axis=0), [1., 1., 1., 1.],
                               atol=0.0001)