def test_transform_numpy_array(self):
    """Transforming a plain numpy array should still yield a pandas DataFrame."""
    # Column names are unavailable on an ndarray, so express each group
    # as a list of positional column indices instead of column labels.
    positional_groups = {}
    for group_name, column_names in self.groups.items():
        positional_groups[group_name] = [
            self.X.columns.get_loc(name) for name in column_names
        ]
    model = prince.MFA(groups=positional_groups)
    coordinates = model.fit(self.X.values).transform(self.X.values)
    self.assertTrue(isinstance(coordinates, pd.DataFrame))
def test_plot_partial_row_coordinates(self):
    """Plotting partial row coordinates should return a matplotlib Axes."""
    model = prince.MFA(groups=self.groups)
    # Cast a few columns to strings so that their group becomes categorical.
    for column in ('E1 fruity', 'E1 woody', 'E1 coffee'):
        self.X[column] = self.X[column].astype(str)
    model.fit(self.X)
    axes = model.plot_partial_row_coordinates(self.X)
    self.assertTrue(isinstance(axes, mpl.axes.Axes))
def test_fit_numpy_array(self):
    """Fitting on a plain numpy array should return the estimator itself."""
    # Translate the label-based groups into positional indices, since an
    # ndarray carries no column names.
    positional_groups = {}
    for group_name, column_names in self.groups.items():
        positional_groups[group_name] = [
            self.X.columns.get_loc(name) for name in column_names
        ]
    model = prince.MFA(groups=positional_groups,
                       rescale_with_mean=False,
                       rescale_with_std=False)
    fitted = model.fit(self.X.values)
    self.assertTrue(isinstance(fitted, prince.MFA))
# NOTE(review): this chunk starts mid-expression — the opening of the dict
# closed below (presumably ``group_sizes = {...``) lies outside this view.
    'ens': 2
}

# Slice the flat ``variables`` sequence into consecutive, non-overlapping
# runs, one per group, using the per-group sizes declared above.
i = 0
groups = {}
for name, n in group_sizes.items():
    groups[name] = variables[i:i + n]
    i += n

# Fit a Multiple Factor Analysis on X using the groups built above.
mfa = prince.MFA(
    groups=groups,
    n_components=5,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)
mfa = mfa.fit(X)

# Leftover debugging output, kept as found:
# print('ORIG')
# print(mfa.partial_factor_analysis_['orig'].eigenvalues_)
# print(mfa.partial_factor_analysis_['orig'].s_)
# print('---')
# print('VIS')
# print(mfa.partial_factor_analysis_['vis'].eigenvalues_)
# print(mfa.partial_factor_analysis_['vis'].s_)
# print('---')
def Preprocess(data_frame, target=None, method='FAMD', samples=None,
               mapper=None, num_components=3, scaler=None,
               encode_method='Binary', target_encoder=None, data_encoder=None,
               data_columns_dict=None, target_column_dict=None, groups=None,
               normalization='l2'):
    """Encode a data frame, project it (MFA/PCA) and normalize the vectors.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Raw data, including the target column.
    target : str, optional
        Target column name; defaults to the last column of ``data_frame``.
    method : str
        'MFA' or 'PCA'. 'PCA' is currently redirected to 'MFA' (see below);
        any other value (e.g. the default 'FAMD') takes the MFA branch too.
    samples : pd.DataFrame, optional
        Row selections: all but the last column select the training rows,
        the last column selects the test rows. When omitted, the whole frame
        is treated as a test set and ``mapper``/encoders must be supplied.
    mapper : fitted prince model, optional
        Projection model to reuse on the no-samples path.
    num_components : int
        Number of components for the projection.
    scaler, encode_method, target_encoder, data_encoder,
    data_columns_dict, target_column_dict, groups, normalization :
        Passed through to ``Fit_Encode`` / ``Normalization``;
        ``normalization`` is one of 'l1', 'l2', 'max', 'standard' or None.

    Returns
    -------
    tuple
        The shape of the returned tuple differs per branch (train+test vs
        test-only); see the ``return`` statements below.
    """
    # If no target supplied, take the last column of the frame as target.
    if not target:
        target = data_frame.columns.values.tolist()[-1]

    # TO DO: Fix PCA.
    if method == 'PCA':
        print('Dummy is not functionning proberly.')
        method = 'MFA'

    # Bug fix: the original called ``normalization.lower()`` unconditionally,
    # which raised AttributeError when ``normalization`` was passed as None.
    if normalization is not None:
        normalization = normalization.lower()
    if normalization not in ['l1', 'l2', 'max', 'standard', None]:
        print('Not a valid normalization method change to None')
        normalization = None

    if samples is not None:
        # Sample the data set; split into training and testing sets.
        train_data = data_frame.loc[samples.iloc[:, :-1].values.flatten(), :]
        test_data = data_frame.loc[samples.iloc[:, -1].values.flatten(), :]
        train_target = train_data[target].copy()
        test_target = test_data[target].copy()
        train_data = train_data.drop(columns=[target])
        test_data = test_data.drop(columns=[target])

        # Fit the encoders on the training split only, then reuse them on
        # the test split so both share one encoding.
        train_data, data_encoder, data_columns_dict = Fit_Encode(
            train_data, method=encode_method)
        test_data, _, _ = Fit_Encode(test_data,
                                     mappings=data_encoder,
                                     columns_dict=data_columns_dict,
                                     method=encode_method)
        train_target, target_encoder, target_column_dict = Fit_Encode(
            train_target, method=encode_method)
        test_target, _, _ = Fit_Encode(test_target,
                                       mappings=target_encoder,
                                       columns_dict=target_column_dict,
                                       method=encode_method)
    else:
        # No samples supplied: process the entire data set as one test set,
        # reusing the encoders handed in by the caller.
        test_data = data_frame.copy()
        test_target = test_data[target].copy()
        test_data = test_data.drop(columns=[target])
        test_data, test_data_encoder, test_columns_dict = Fit_Encode(
            test_data,
            mappings=data_encoder,
            columns_dict=data_columns_dict,
            method=encode_method)
        print('Test Data Encoded')
        test_target, test_target_encoder, _ = Fit_Encode(
            test_target,
            mappings=target_encoder,
            columns_dict=target_column_dict,
            method=encode_method)
        print('Test targets encoded')

    if method == 'MFA':
        # Derive the MFA groups from the encoded column names unless the
        # caller provided them explicitly.
        if not groups:
            groups = {}
            for key in data_columns_dict.keys():
                names = ['_' + s for s in data_columns_dict[key]]
                groups[key] = [x + y for x, y in it.product([key], names)]
        if not mapper:
            print('No mapper found')
            # Consider passing **kwargs in Preprocess func. to pass in mappers.
            mapper = pr.MFA(
                groups=groups,
                n_components=num_components,
                n_iter=100,
                # rescale_with_mean / rescale_with_std do not work here;
                # use a sklearn StandardScaler upstream instead.
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None)
            print('Fitting MFA')
        if samples is not None:
            # Bug fix: the original fitted a local ``mfa`` that was undefined
            # whenever a mapper was passed in; fit the (possibly new) mapper.
            mapper = mapper.fit(train_data)
            # Vectors for training/test set.
            vecs_train = pd.DataFrame(mapper.row_coordinates(train_data))
            vecs_test = pd.DataFrame(mapper.transform(test_data))
            vecs_train, scaler = Normalization(vecs_train, normalization, scaler)
            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)
            return (vecs_train, train_target, vecs_test, test_target,
                    data_columns_dict, target_column_dict, data_encoder,
                    target_encoder, groups, target, mapper, scaler)
        else:
            # Project the single (test) set with the reused mapper — this
            # path requires an already-fitted mapper from the caller.
            vecs_test = pd.DataFrame(mapper.transform(test_data))
            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)
            # Consider returning a single dictionary with all parameters;
            # each case has a different number of returned variables.
            return (vecs_test, test_target, test_data_encoder,
                    test_target_encoder, mapper, target, scaler)
    elif method == 'PCA':
        # NOTE(review): currently unreachable because method == 'PCA' is
        # rewritten to 'MFA' above; kept (and repaired) for when PCA is
        # re-enabled.
        if not mapper:
            mapper = pr.PCA(n_components=num_components,
                            n_iter=100,
                            rescale_with_mean=True,
                            rescale_with_std=True,
                            copy=True,
                            check_input=True,
                            engine='auto',
                            random_state=None)
        if samples is not None:
            pca_train = mapper.fit(train_data)
            vecs_train = pd.DataFrame(pca_train.row_coordinates(train_data))
            # Bug fix: ``mapper.transform`` already returns the row
            # coordinates as a DataFrame, so the original chain
            # ``mapper.transform(test_data).row_coordinates(...)`` failed.
            vecs_test = pd.DataFrame(mapper.transform(test_data))
            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_train = pd.DataFrame(preprocessing.normalize(
                    vecs_train, norm=normalization, axis=1),
                    columns=vecs_train.columns)
                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                    columns=vecs_test.columns)
            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_train = pd.DataFrame(scaler.fit_transform(vecs_train),
                                          columns=vecs_train.columns)
                # Bug fix: the test set must reuse the scaler fitted on the
                # training set instead of refitting it (data leakage).
                vecs_test = pd.DataFrame(scaler.transform(vecs_test),
                                         columns=vecs_test.columns)
            # Bug fix: the original returned the undefined names
            # ``target_encoders`` and ``data_endoder``.
            return (vecs_train, train_target, vecs_test, test_target,
                    target_encoder, data_encoder, mapper, target, scaler)
        else:
            # Bug fix: ``data_endoder`` was a typo for ``data_encoder``.
            # NOTE(review): encoding ``test_data[target]`` here looks
            # suspicious (the target was already dropped above) — confirm
            # the intended input when PCA is re-enabled.
            test_data, data_encoder = encode_categorical(
                test_data[target].copy(),
                encode_method=encode_method,
                encoder=data_encoder)
            pca_test = mapper.fit(test_data)
            vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))
            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                    columns=vecs_test.columns)
            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_test = pd.DataFrame(scaler.fit_transform(vecs_test),
                                         columns=vecs_test.columns)
            return vecs_test, test_target, mapper, target
def test_transform_pandas_dataframe(self):
    """fit + transform on a DataFrame (no rescaling) returns a DataFrame."""
    model = prince.MFA(groups=self.groups,
                       rescale_with_mean=False,
                       rescale_with_std=False)
    fitted = model.fit(self.X)
    result = fitted.transform(self.X)
    self.assertTrue(isinstance(result, pd.DataFrame))
def test_fit_pandas_dataframe(self):
    """fit on a DataFrame (no rescaling) returns the estimator itself."""
    model = prince.MFA(groups=self.groups,
                       rescale_with_mean=False,
                       rescale_with_std=False)
    fitted = model.fit(self.X)
    self.assertTrue(isinstance(fitted, prince.MFA))
def test_transform_pandas_dataframe(self):
    """fit + transform on a DataFrame returns a DataFrame."""
    model = prince.MFA(groups=self.groups)
    result = model.fit(self.X).transform(self.X)
    self.assertTrue(isinstance(result, pd.DataFrame))
def test_fit_pandas_dataframe(self):
    """fit on a DataFrame returns the estimator itself."""
    model = prince.MFA(groups=self.groups)
    fitted = model.fit(self.X)
    self.assertTrue(isinstance(fitted, prince.MFA))
def test_mixed_groups(self):
    """A group mixing categorical and numerical columns must be rejected."""
    model = prince.MFA(groups=self.groups)
    # Turn one column categorical so its group mixes variable types.
    self.X['E1 fruity'] = self.X['E1 fruity'].astype('category')
    with self.assertRaises(ValueError):
        model.fit(self.X)
def test_no_groups(self):
    """Fitting without specifying groups must raise a ValueError."""
    model = prince.MFA()
    with self.assertRaises(ValueError):
        model.fit(self.X)
# Printing basic info of dataset print ('Number of records:',data.shape[0]) print ('Number of attributes:',data.shape[1]) # Printing Column names print([a for a in data.columns]) raw_data = data.drop(['timestamp','group'],axis=1) # Stadardizing the dataset #std_rawdata = preprocessing.StandardScaler().fit_transform(raw_data) """ mca = prince.MCA(n_components =2, n_iter=3,copy=True,engine='auto') mca = mca.fit(raw_data) """ groups ={'physical':['disengaged','looking','talking','intTech','intRes','intExt'],'logs':['Accessed','Create','Open','Update']} mfa = prince.MFA(groups=groups,n_components = 2) mfa = mfa.fit(raw_data) #mcadf = mca.row_coordinates(raw_data) #mcadf.to_csv('mcaresult.csv') mfadf = mfa.row_coordinates(raw_data) mfadf.to_csv('mfaresult2.csv')
# Drop the first columns of each session's frame (three for the Nov
# sessions, two for 6 Dec — that file has one fewer leading column).
data_ready_8nov = data_8nov.drop(data_8nov.columns[[0,1,2]],axis=1)
data_ready_22nov = data_22nov.drop(data_22nov.columns[[0,1,2]],axis=1)
data_ready_6dec = data_6dec.drop(data_6dec.columns[[0,1]],axis=1)

# Only the 22 Nov frame is cleaned of missing rows here —
# NOTE(review): verify the other sessions are already NaN-free.
data_ready_22nov.dropna(axis=0,how="any",inplace=True)

# Standardize each session with its own scaler and restore column names;
# the 6 Dec session uses the reduced column set ``group_partial``.
data_ready_18oct = pd.DataFrame(scaler1.fit_transform(data_ready_18oct),columns=group_all)
data_ready_8nov = pd.DataFrame(scaler2.fit_transform(data_ready_8nov),columns=group_all)
data_ready_22nov = pd.DataFrame(scaler3.fit_transform(data_ready_22nov),columns=group_all)
data_ready_6dec = pd.DataFrame(scaler4.fit_transform(data_ready_6dec),columns=group_partial)

# One 2-component MFA per session; the 6 Dec model uses the alternate
# grouping ``groups1`` to match its reduced column set.
#famd = prince.FAMD(n_components = 2)
mfa1 = prince.MFA(groups=groups,n_components=2)
mfa2 = prince.MFA(groups=groups,n_components=2)
mfa3 = prince.MFA(groups=groups,n_components=2)
mfa4 = prince.MFA(groups=groups1,n_components=2)

# Fit each model and keep the row coordinates.
#famd_result = famd.fit_transform(std_data_8nov)
mfa_result_18oct = mfa1.fit_transform(data_ready_18oct)
mfa_result_8nov = mfa2.fit_transform(data_ready_8nov)
mfa_result_22nov = mfa3.fit_transform(data_ready_22nov)
mfa_result_6dec = mfa4.fit_transform(data_ready_6dec)

# Export each session's coordinates to its own CSV file.
#famd.to_csv('famd_result.csv')
mfa_result_18oct.to_csv('mfa_result_18oct.csv')
mfa_result_8nov.to_csv('mfa_result_8nov.csv')
mfa_result_22nov.to_csv('mfa_result_22nov.csv')
mfa_result_6dec.to_csv('mfa_result_6dec.csv')