def test_analyze_rotation_value_error(self):
    data = pd.DataFrame({
        'A': [2, 4, 5, 6, 8, 9],
        'B': [4, 8, np.nan, 10, 16, 18],
        'C': [6, 12, 15, 12, 26, 27]
    })
    fa = FactorAnalyzer(rotation='blah', n_factors=1)
    fa.fit(data)
def test_analyze_infinite(self):
    data = pd.DataFrame({
        'A': [2, 4, 5, 6, 8, 9],
        'B': [4, 8, float('inf'), 10, 16, 18],
        'C': [6, 12, 15, 12, 26, 27]
    })
    fa = FactorAnalyzer()
    fa.analyze(data, 1, impute='drop')
def test_analyze_impute_value_error(self):
    data = pd.DataFrame({
        'A': [2, 4, 5, 6, 8, 9],
        'B': [4, 8, np.nan, 10, 16, 18],
        'C': [6, 12, 15, 12, 26, 27]
    })
    fa = FactorAnalyzer()
    fa.analyze(data, 1, rotation=None, impute='blah')
def test_analyze_infinite(self):
    data = pd.DataFrame(
        {
            'A': [1.0, 0.4, 0.5],
            'B': [0.4, 1.0, float('inf')],
            'C': [0.5, float('inf'), 1.0]
        },
        index=['A', 'B', 'C'])
    fa = FactorAnalyzer(impute='drop', n_factors=1, is_corr_matrix=True)
    fa.fit(data)
def test_analyze_impute_drop(self):
    data = pd.DataFrame({
        'A': [2, 4, 5, 6, 8, 9],
        'B': [4, 8, np.nan, 10, 16, 18],
        'C': [6, 12, 15, 12, 26, 27]
    })
    expected = data.copy().dropna()
    expected_corr = expected.corr()

    fa = FactorAnalyzer()
    fa.analyze(data, 1, rotation=None, impute='drop')
    assert_frame_equal(fa.corr, expected_corr)
def test_analyze_weights(self):
    data = pd.DataFrame({
        'A': [2, 4, 5, 6, 8, 9],
        'B': [4, 8, 9, 10, 16, 18],
        'C': [6, 12, 15, 12, 26, 27]
    })
    fa = FactorAnalyzer(rotation=None)
    fa.fit(data)
    _ = fa.transform(data)
    expected_weights = np.array([[0.33536334, -2.72509646, 0],
                                 [0.33916605, -0.29388849, 0],
                                 [0.33444588, 3.03060826, 0]])
    assert_array_almost_equal(expected_weights, fa.weights_)
def test_analyze_impute_drop(self):
    data = pd.DataFrame({
        'A': [2, 4, 5, 6, 8, 9],
        'B': [4, 8, np.nan, 10, 16, 18],
        'C': [6, 12, 15, 12, 26, 27]
    })
    expected = data.copy().dropna()
    expected_corr = expected.corr().values

    fa = FactorAnalyzer(rotation=None, impute='drop', n_factors=1)
    fa.fit(data)
    assert_array_almost_equal(fa.corr_, expected_corr)
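# The impute parameter controls how missing values are handled before the
# correlation matrix is computed: 'drop' (used in the tests above) removes
# incomplete rows, while 'mean' and 'median' fill missing cells column-wise.
# A quick sketch of the manual preprocessing that 'mean' corresponds to:
import numpy as np
import pandas as pd

data = pd.DataFrame({
    'A': [2, 4, 5, 6, 8, 9],
    'B': [4, 8, np.nan, 10, 16, 18],
    'C': [6, 12, 15, 12, 26, 27]
})
mean_imputed_corr = data.fillna(data.mean()).corr()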
def test_remove_all_columns(self):
    # test that columns with string values are removed
    data = pd.DataFrame({'A': ['1', 2, 3, 4, 5],
                         'B': [6, 7, 8, 9, '10']})
    result = FactorAnalyzer().remove_non_numeric(data)
    assert result.empty
def test_remove_one_column(self):
    # test that only the column with string values is removed
    data = pd.DataFrame({'A': ['1', 2, 3, 4, 5],
                         'B': [6, 7, 8, 9, 10]})
    expected = pd.DataFrame({'B': [6, 7, 8, 9, 10]})
    result = FactorAnalyzer().remove_non_numeric(data)
    assert_frame_equal(expected, result)
def test_remove_no_columns(self):
    # test that no columns are removed when all of them are numeric
    data = pd.DataFrame({
        'A': [1, 2, 3, 4, 5],
        'B': [6.1, 7.2, 8.4, 9.2, 10.1]
    })
    result = FactorAnalyzer().remove_non_numeric(data)
    assert_frame_equal(data, result)
def test_factor_variance(self):
    path = 'tests/data/test01.csv'
    data = pd.read_csv(path)

    fa = FactorAnalyzer(n_factors=3, rotation=None)
    fa.fit(data)
    loadings = fa.loadings_
    n_rows = loadings.shape[0]

    # calculate variance
    loadings = loadings**2
    variance = np.sum(loadings, axis=0)

    # calculate proportional variance
    proportional_variance_expected = variance / n_rows
    proportional_variance = fa.get_factor_variance()[1]

    assert_almost_equal(proportional_variance_expected, proportional_variance)
def runFactorAnalyzer(self, cols_to_norm, result):
    fa = FactorAnalyzer(rotation="varimax", n_factors=2)

    # drop incomplete rows from `result` before slicing, so that the factor
    # scores computed from `df` stay row-aligned with the metadata columns
    # used below
    result = result.dropna()
    df = result[cols_to_norm]

    fa.fit(df)
    ev = fa.get_eigenvalues()

    # flag poor sampling adequacy (0.6 is the conventional minimum for KMO)
    kmo_all, kmo_model = calculate_kmo(df)
    if kmo_model < 0.6:
        print("kmo_model: %s " % kmo_model)

    array = fa.transform(df)
    # print("Factors: %s" % (array))
    # print("loadings: %s " % fa.loadings_)
    # print("eigenvalues: %s " % ev[0])

    dataframe = pd.DataFrame(columns=[
        'Player', 'Session', 'Time', 'NegativeEmotion', 'PositiveEmotion'
    ])
    print("T session: %s " % len(result['Session']))
    dataframe['Session'] = result['Session']
    dataframe['Player'] = result['Player']
    dataframe['Time'] = result['ts']
    dataframe['NegativeEmotion'] = np.around(array[:, 0], 2)
    dataframe['PositiveEmotion'] = np.around(array[:, 1], 2)
    dataframe.to_csv('/home/elton/Desktop/Dataset/MetricsEmotion.csv',
                     sep=',', mode='a', header=False)
def test_factor_variance(self):
    path = 'tests/data/test01.csv'
    data = pd.read_csv(path)

    fa = FactorAnalyzer()
    fa.analyze(data, 3, rotation=None)
    loadings = fa.loadings
    n_rows = loadings.shape[0]

    # calculate variance
    loadings = loadings**2
    variance = loadings.sum(axis=0)

    # calculate proportional variance
    proportional_variance_expected = variance / n_rows
    proportional_variance = fa.get_factor_variance().loc['Proportion Var']

    proportional_variance_expected.name = ''
    proportional_variance.name = ''

    assert_almost_equal(proportional_variance_expected, proportional_variance)
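# The two test_factor_variance variants above target different releases of
# factor_analyzer: the older analyze() API returns a labeled DataFrame from
# get_factor_variance(), while the newer fit() API returns a tuple of
# (variance, proportional variance, cumulative variance) arrays. A minimal
# sketch of how the three values relate under the newer API:
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer

data = pd.read_csv('tests/data/test01.csv')
fa = FactorAnalyzer(n_factors=3, rotation=None)
fa.fit(data)

variance, proportional, cumulative = fa.get_factor_variance()
# each factor's variance is the sum of its squared loadings; dividing by
# the number of observed variables gives the proportion explained
assert np.allclose((fa.loadings_ ** 2).sum(axis=0), variance)
assert np.allclose(variance / data.shape[1], proportional)
assert np.allclose(np.cumsum(proportional), cumulative)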
def test_smc_is_r_squared(self):
    # test that SMC is roughly equivalent to R-squared values
    data = pd.DataFrame({
        'A': [10.5, 20.1, 30.2, 40.1, 50.3],
        'B': [62, 71, 83, 91, 15],
        'C': [0.45, 0.90, 0.22, 0.34, 0.045]
    })
    expected_r2 = [0.478330, 0.196223, 0.484519]
    expected_r2 = pd.DataFrame(expected_r2,
                               index=['A', 'B', 'C'],
                               columns=['SMC'])
    smc_result = FactorAnalyzer.smc(data)
    assert_frame_equal(smc_result, expected_r2, check_less_precise=2)
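# smc() computes the squared multiple correlation of each variable, which is
# the R-squared from regressing that variable on all of the others; the
# expected values in the test above come from exactly that regression. A
# minimal cross-check sketch using scikit-learn (LinearRegression here is an
# illustrative choice, not part of the test suite):
import pandas as pd
from sklearn.linear_model import LinearRegression

def r_squared_per_column(data):
    """Regress each column on the remaining columns and collect R-squared."""
    scores = {}
    for col in data.columns:
        X = data.drop(columns=col)
        y = data[col]
        scores[col] = LinearRegression().fit(X, y).score(X, y)
    return pd.Series(scores)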
def test_gridsearch():
    # make sure this doesn't fail
    X = pd.DataFrame(np.random.randn(1000).reshape(100, 10))
    y = pd.Series(np.random.choice([1, 0], size=100))
    grid = {
        'factoranalyzer__n_factors': [5, 7],
        'factoranalyzer__rotation': [None, 'varimax'],
        'decisiontreeclassifier__max_depth': [2, 5]
    }
    fa = FactorAnalyzer()
    decisiontree = DecisionTreeClassifier(random_state=123)
    pipe = make_pipeline(fa, decisiontree)
    gridsearch = GridSearchCV(pipe, grid, scoring='f1', cv=3, verbose=0)
    gridsearch.fit(X, y)
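# Once the grid search has run, the standard scikit-learn attributes are
# available for inspection; a small sketch of reporting the winning
# combination of n_factors, rotation, and max_depth:
def report_best(gridsearch):
    """Print the best hyper-parameters and CV score of a fitted search."""
    print(gridsearch.best_params_)
    print("best f1: %.3f" % gridsearch.best_score_)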
# the graph would show that 4 components are acceptable
if do_plot:
    sn.scatterplot(x=count_axis, y=pca.explained_variance_)
    plt.show()

# ----------------------------------------------------------------------------
# Explore 2: 2D plot of all individuals using the 2D PCA vs 2D Factor analysis
# ----------------------------------------------------------------------------

# PCA 2d
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
plotIn2D(X_train_pca, 'principal component', 1)

# FA 2d
fa = FactorAnalyzer(rotation=None, n_factors=2)
fa.fit(X_train_scaled)
X_train_fa = fa.transform(X_train_scaled)
plotIn2D(X_train_fa, 'Factor analysis', 2)

# the following call shows the two 2D graphs, PCA and FA. The individuals
# are distributed very similarly in both, and the plane gives a clear
# separation of the benign cases from the malignant ones.
plt.show()

# X_test_fa = fa.transform(X_test_scaled)
# print("X_train_fa")
# print(X_train_fa)

# Method 1: Logistic regression
# -----------------------------
def main():
    """
    Run the script.
    """
    # set up an argument parser
    parser = argparse.ArgumentParser(prog='factor_analyzer.py')
    parser.add_argument(dest='feature_file',
                        help="Input file containing the pre-processed "
                             "features for the training data")
    parser.add_argument(dest='output_dir',
                        help="Output directory to save the output files")
    parser.add_argument('-f', '--factors', dest="num_factors", type=int,
                        default=3, required=False,
                        help="Number of factors to use (Default 3)")
    parser.add_argument('-r', '--rotation', dest="rotation", type=str,
                        default='none', required=False,
                        help="The rotation to perform (Default 'none')")
    parser.add_argument('-m', '--method', dest="method", type=str,
                        default='minres', required=False,
                        help="The method to use (Default 'minres')")

    # parse the given command line arguments
    args = parser.parse_args()
    method = args.method
    factors = args.num_factors
    rotation = None if args.rotation == 'none' else args.rotation

    file_path = args.feature_file
    if not file_path.lower().endswith('.csv'):
        raise ValueError('The feature file must be in CSV format.')
    data = pd.read_csv(file_path)

    # set up the logger
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # log some useful messages so that the user knows what is happening
    logger.info(
        "Starting exploratory factor analysis on: {}.".format(file_path))

    # run the analysis
    analyzer = FactorAnalyzer()
    analyzer.analyze(data, factors, rotation, method)

    # create paths for the loadings, eigenvalues, communalities, and variance
    path_loadings = os.path.join(args.output_dir, 'loadings.csv')
    path_eigen = os.path.join(args.output_dir, 'eigenvalues.csv')
    path_communalities = os.path.join(args.output_dir, 'communalities.csv')
    path_variance = os.path.join(args.output_dir, 'variance.csv')

    # retrieve the loadings, eigenvalues, communalities, and variance
    loadings = analyzer.loadings
    eigen, _ = analyzer.get_eigenvalues()
    communalities = analyzer.get_communalities()
    variance = analyzer.get_factor_variance()

    # save the files
    logger.info("Saving files...")
    loadings.to_csv(path_loadings)
    eigen.to_csv(path_eigen)
    communalities.to_csv(path_communalities)
    variance.to_csv(path_variance)
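# Example invocation of the script above (hypothetical file names):
#
#   python factor_analyzer.py features.csv output/ -f 3 -r varimax -m ml
#
# which would write loadings.csv, eigenvalues.csv, communalities.csv, and
# variance.csv into output/.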
def get_factor_eigenvalues(df):
    fa = FactorAnalyzer(rotation=None)
    fa.fit(df)
    ev, v = fa.get_eigenvalues()
    return ev
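# A common use for the eigenvalues returned above is choosing the number of
# factors via the Kaiser criterion (retain factors whose eigenvalue exceeds
# 1). A minimal sketch building on get_factor_eigenvalues (the helper name
# and threshold choice are illustrative):
def suggest_n_factors(df):
    """Count the factors whose original eigenvalue exceeds 1."""
    ev = get_factor_eigenvalues(df)
    return int((ev > 1).sum())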
def test_analyze_bad_svd_method(self):
    fa = FactorAnalyzer(svd_method='foo')
    fa.fit(np.random.randn(500).reshape(100, 5))
def test_analyze_rotation_value_error(self):
    fa = FactorAnalyzer(rotation='blah', n_factors=1)
    fa.fit(np.random.randn(500).reshape(100, 5))
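# The error-path tests above (bad rotation, bad svd_method, bad impute) call
# fit()/analyze() without an explicit assertion, so the expected ValueError
# is presumably asserted by a decorator in the surrounding test class. A
# minimal pytest-style sketch of the same expectation (pytest being an
# assumption about the test runner):
import numpy as np
import pytest
from factor_analyzer import FactorAnalyzer

def test_rotation_value_error_with_pytest():
    fa = FactorAnalyzer(rotation='blah', n_factors=1)
    with pytest.raises(ValueError):
        fa.fit(np.random.randn(500).reshape(100, 5))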