Example #1
0
    def test_analyze_rotation_value_error(self):
        """Fit a one-factor model configured with the rotation name 'blah'.

        NOTE(review): 'blah' is presumably not a supported rotation, so the
        fit is likely expected to fail; the expectation itself is not
        visible in this excerpt -- confirm.
        """
        columns = {
            'A': [2, 4, 5, 6, 8, 9],
            'B': [4, 8, np.nan, 10, 16, 18],
            'C': [6, 12, 15, 12, 26, 27],
        }
        frame = pd.DataFrame(columns)

        analyzer = FactorAnalyzer(rotation='blah', n_factors=1)
        analyzer.fit(frame)
Example #2
0
    def test_analyze_infinite(self):
        """Analyze a frame holding an infinite value with ``impute='drop'``.

        NOTE(review): presumably this call is expected to fail; the
        expectation itself is not visible in this excerpt -- confirm.
        """
        columns = {
            'A': [2, 4, 5, 6, 8, 9],
            'B': [4, 8, float('inf'), 10, 16, 18],
            'C': [6, 12, 15, 12, 26, 27],
        }
        frame = pd.DataFrame(columns)

        analyzer = FactorAnalyzer()
        analyzer.analyze(frame, 1, impute='drop')
Example #3
0
    def test_analyze_impute_value_error(self):
        """Analyze a frame with a missing value using ``impute='blah'``.

        NOTE(review): 'blah' is presumably not a supported imputation
        option, so the call is likely expected to fail; the expectation
        itself is not visible in this excerpt -- confirm.
        """
        columns = {
            'A': [2, 4, 5, 6, 8, 9],
            'B': [4, 8, np.nan, 10, 16, 18],
            'C': [6, 12, 15, 12, 26, 27],
        }
        frame = pd.DataFrame(columns)

        analyzer = FactorAnalyzer()
        analyzer.analyze(frame, 1, rotation=None, impute='blah')
Example #4
0
    def test_analyze_infinite(self):
        """Fit on a 3x3 matrix flagged as a correlation matrix that holds
        infinite entries.

        NOTE(review): presumably the fit is expected to fail; the
        expectation itself is not visible in this excerpt -- confirm.
        """
        labels = ['A', 'B', 'C']
        matrix = {
            'A': [1.0, 0.4, 0.5],
            'B': [0.4, 1.0, float('inf')],
            'C': [0.5, float('inf'), 1.0],
        }
        frame = pd.DataFrame(matrix, index=labels)

        analyzer = FactorAnalyzer(impute='drop',
                                  n_factors=1,
                                  is_corr_matrix=True)
        analyzer.fit(frame)
Example #5
0
    def test_analyze_impute_drop(self):
        """Check that ``impute='drop'`` leaves ``fa.corr`` equal to the
        correlation matrix of the rows without missing values."""
        frame = pd.DataFrame({
            'A': [2, 4, 5, 6, 8, 9],
            'B': [4, 8, np.nan, 10, 16, 18],
            'C': [6, 12, 15, 12, 26, 27],
        })

        # reference: correlations computed after discarding the NaN row
        reference_corr = frame.dropna().corr()

        analyzer = FactorAnalyzer()
        analyzer.analyze(frame, 1, rotation=None, impute='drop')
        assert_frame_equal(analyzer.corr, reference_corr)
Example #6
0
    def test_analyze_weights(self):
        """Check ``weights_`` against precomputed values after an unrotated
        fit followed by a transform of the same data."""
        frame = pd.DataFrame({
            'A': [2, 4, 5, 6, 8, 9],
            'B': [4, 8, 9, 10, 16, 18],
            'C': [6, 12, 15, 12, 26, 27],
        })

        analyzer = FactorAnalyzer(rotation=None)
        analyzer.fit(frame)
        # the scores themselves are not inspected, only the weights
        analyzer.transform(frame)

        reference_weights = np.array([
            [0.33536334, -2.72509646, 0],
            [0.33916605, -0.29388849, 0],
            [0.33444588, 3.03060826, 0],
        ])
        assert_array_almost_equal(reference_weights, analyzer.weights_)
Example #7
0
    def test_analyze_impute_drop(self):
        """Check that fitting with ``impute='drop'`` leaves ``corr_`` equal
        to the correlation matrix of the rows without missing values."""
        frame = pd.DataFrame({
            'A': [2, 4, 5, 6, 8, 9],
            'B': [4, 8, np.nan, 10, 16, 18],
            'C': [6, 12, 15, 12, 26, 27],
        })

        # reference: correlations after discarding the NaN row, as an array
        reference_corr = frame.dropna().corr().values

        analyzer = FactorAnalyzer(rotation=None, impute='drop', n_factors=1)
        analyzer.fit(frame)
        assert_array_almost_equal(analyzer.corr_, reference_corr)
Example #8
0
    def test_remove_all_columns(self):
        """Every column holds a string value, so nothing should remain."""
        mixed_columns = {
            'A': ['1', 2, 3, 4, 5],
            'B': [6, 7, 8, 9, '10'],
        }
        frame = pd.DataFrame(mixed_columns)

        cleaned = FactorAnalyzer().remove_non_numeric(frame)

        assert cleaned.empty
Example #9
0
    def test_remove_one_column(self):
        """Only the column that holds a string value should be removed."""
        numeric_values = [6, 7, 8, 9, 10]
        frame = pd.DataFrame({'A': ['1', 2, 3, 4, 5], 'B': numeric_values})
        reference = pd.DataFrame({'B': numeric_values})

        cleaned = FactorAnalyzer().remove_non_numeric(frame)

        assert_frame_equal(reference, cleaned)
Example #10
0
    def test_remove_no_columns(self):
        """A frame with only numeric columns should come back unchanged."""
        numeric_columns = {
            'A': [1, 2, 3, 4, 5],
            'B': [6.1, 7.2, 8.4, 9.2, 10.1],
        }
        frame = pd.DataFrame(numeric_columns)

        cleaned = FactorAnalyzer().remove_non_numeric(frame)

        assert_frame_equal(frame, cleaned)
Example #11
0
    def test_factor_variance(self):
        """Check the second item returned by ``get_factor_variance()``
        against a value recomputed from the fitted loadings."""
        frame = pd.read_csv('tests/data/test01.csv')

        analyzer = FactorAnalyzer(n_factors=3, rotation=None)
        analyzer.fit(frame)

        fitted_loadings = analyzer.loadings_
        row_count = fitted_loadings.shape[0]

        # per-factor variance: column-wise sum of the squared loadings
        factor_variance = np.sum(fitted_loadings ** 2, axis=0)

        # proportional variance: variance divided by the number of rows
        expected_proportion = factor_variance / row_count
        actual_proportion = analyzer.get_factor_variance()[1]

        assert_almost_equal(expected_proportion, actual_proportion)
Example #12
0
    def runFactorAnalyzer(
            self,
            cols_to_norm,
            result,
            output_path='/home/elton/Desktop/Dataset/MetricsEmotion.csv'):
        """Fit a two-factor varimax model and append the scores to a CSV.

        Parameters
        ----------
        cols_to_norm : list
            Labels of the columns of ``result`` used for the factor analysis.
        result : pandas.DataFrame
            Input table. Besides ``cols_to_norm`` it must provide the
            'Session', 'Player' and 'ts' columns copied to the output.
        output_path : str, optional
            CSV file the rows are appended to (written without a header).
            Defaults to the location that used to be hard-coded.

        Notes
        -----
        The first and second factor scores, rounded to two decimals, are
        written as 'NegativeEmotion' and 'PositiveEmotion' respectively.
        """
        # Drop incomplete rows once, *then* select the analysis columns.
        # Dropping NaNs separately on the full table and on the column subset
        # can leave them with different lengths, in which case the scores no
        # longer line up with the Session/Player/ts rows.
        result = result.dropna()
        df = result[cols_to_norm]

        fa = FactorAnalyzer(rotation="varimax", n_factors=2)
        fa.fit(df)

        # Only the overall KMO value is used; report it when it is below 0.6.
        _, kmo_model = calculate_kmo(df)
        if kmo_model < 0.6:
            print("kmo_model: %s " % kmo_model)

        scores = np.around(fa.transform(df), 2)

        dataframe = pd.DataFrame(columns=[
            'Player', 'Session', 'Time', 'NegativeEmotion', 'PositiveEmotion'
        ])
        print("T session: %s " % len(result['Session']))
        dataframe['Session'] = result['Session']
        dataframe['Player'] = result['Player']
        dataframe['Time'] = result['ts']
        dataframe['NegativeEmotion'] = scores[:, 0]
        dataframe['PositiveEmotion'] = scores[:, 1]

        # Append mode without a header: repeated calls accumulate rows in
        # the same file.
        dataframe.to_csv(output_path, sep=',', mode='a', header=False)
Example #13
0
    def test_factor_variance(self):
        """Check the 'Proportion Var' row of ``get_factor_variance()``
        against a value recomputed from the loadings."""
        frame = pd.read_csv('tests/data/test01.csv')

        analyzer = FactorAnalyzer()
        analyzer.analyze(frame, 3, rotation=None)

        fitted_loadings = analyzer.loadings
        row_count = fitted_loadings.shape[0]

        # per-factor variance: column-wise sum of the squared loadings
        factor_variance = (fitted_loadings ** 2).sum(axis=0)

        # proportional variance: variance divided by the number of rows
        expected_proportion = factor_variance / row_count
        actual_proportion = analyzer.get_factor_variance().loc['Proportion Var']

        # give both sides the same (empty) name before comparing
        expected_proportion.name = ''
        actual_proportion.name = ''

        assert_almost_equal(expected_proportion, actual_proportion)
Example #14
0
    def test_smc_is_r_squared(self):
        """SMC values should roughly match the precomputed R-squared values."""
        frame = pd.DataFrame({
            'A': [10.5, 20.1, 30.2, 40.1, 50.3],
            'B': [62, 71, 83, 91, 15],
            'C': [0.45, 0.90, 0.22, 0.34, 0.045],
        })

        reference_r2 = pd.DataFrame({'SMC': [0.478330, 0.196223, 0.484519]},
                                    index=['A', 'B', 'C'])

        computed_smc = FactorAnalyzer.smc(frame)

        assert_frame_equal(computed_smc, reference_r2, check_less_precise=2)
Example #15
0
def test_gridsearch():
    """Smoke test: a ``FactorAnalyzer`` pipeline can be grid-searched.

    Builds a pipeline of ``FactorAnalyzer`` and a decision tree, and runs a
    3-fold grid search over the number of factors, the rotation and the tree
    depth. The test passes as long as fitting does not raise.
    """
    # A locally seeded generator keeps the synthetic data reproducible from
    # run to run (matching the seeded classifier below) without touching
    # NumPy's global random state.
    rng = np.random.RandomState(123)
    X = pd.DataFrame(rng.randn(100, 10))
    y = pd.Series(rng.choice([1, 0], size=100))

    grid = {
        'factoranalyzer__n_factors': [5, 7],
        'factoranalyzer__rotation': [None, 'varimax'],
        'decisiontreeclassifier__max_depth': [2, 5],
    }

    fa = FactorAnalyzer()
    decisiontree = DecisionTreeClassifier(random_state=123)
    pipe = make_pipeline(fa, decisiontree)

    gridsearch = GridSearchCV(pipe, grid, scoring='f1', cv=3, verbose=0)
    gridsearch.fit(X, y)
# the graph would show that 4 is acceptable
if do_plot:
    # Pass the coordinates by keyword: recent seaborn releases no longer
    # accept x and y as positional arguments (assuming `sn` is seaborn).
    sn.scatterplot(x=count_axis, y=pca.explained_variance_)
    plt.show()

# ----------------------------------------------------------------------------
# Explore 2: 2D plot of all individuals using 2D PCA vs 2D factor analysis
# ----------------------------------------------------------------------------

# PCA 2d
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
plotIn2D(X_train_pca, 'principal component', 1)

# FA 2d
fa = FactorAnalyzer(rotation=None, n_factors=2)
fa.fit(X_train_scaled)
X_train_fa = fa.transform(X_train_scaled)
plotIn2D(X_train_fa, 'Factor analysis', 2)

# The following call displays the two 2D plots (PCA and FA).
# The two plots show a very similar distribution of the individuals.
# The plane allows a clear separation of benign from malignant cases.
plt.show()

# X_test_fa = fa.transform(X_test_scaled)
# print("X_train_fa")
# print(X_train_fa)

# Method 1: Logistic regression
# -----------------------------
Example #17
0
def main():
    """Run an exploratory factor analysis from the command line.

    Parses the command-line arguments, reads the feature CSV, runs the
    analysis and writes ``loadings.csv``, ``eigenvalues.csv``,
    ``communalities.csv`` and ``variance.csv`` into the output directory.

    Raises
    ------
    ValueError
        If the feature file name does not end in ``.csv``.
    """

    # set up an argument parser
    parser = argparse.ArgumentParser(prog='factor_analyzer.py')
    parser.add_argument(
        dest='feature_file',
        help="Input file containing the pre-processed features "
        "for the training data")
    parser.add_argument(
        dest='output_dir',
        help="Output directory to save "
        "the output files",
    )
    parser.add_argument('-f',
                        '--factors',
                        dest="num_factors",
                        type=int,
                        default=3,
                        help="Number of factors to use (Default 3)",
                        required=False)

    parser.add_argument('-r',
                        '--rotation',
                        dest="rotation",
                        type=str,
                        default='none',
                        help="The rotation to perform (Default 'none')",
                        required=False)

    parser.add_argument('-m',
                        '--method',
                        dest="method",
                        type=str,
                        default='minres',
                        help="The method to use (Default 'minres')",
                        required=False)

    # parse given command line arguments
    args = parser.parse_args()

    method = args.method
    factors = args.num_factors
    # the command line spells "no rotation" as the string 'none'
    rotation = None if args.rotation == 'none' else args.rotation

    file_path = args.feature_file

    if not file_path.lower().endswith('.csv'):
        raise ValueError('The feature file must be in CSV format.')

    data = pd.read_csv(file_path)

    # get the logger; the level is set on the logger object, because the
    # `logging` module itself has no `setLevel` function.
    # NOTE: no handler is configured here, so INFO records are only shown
    # if logging has been configured elsewhere.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # log some useful messages so that the user knows
    logger.info("Starting exploratory factor analysis on: %s.", file_path)

    # run the analysis
    analyzer = FactorAnalyzer()
    analyzer.analyze(data, factors, rotation, method)

    # create paths for loadings, eigenvalues, communalities, variance
    path_loadings = os.path.join(args.output_dir, 'loadings.csv')
    path_eigen = os.path.join(args.output_dir, 'eigenvalues.csv')
    path_communalities = os.path.join(args.output_dir, 'communalities.csv')
    path_variance = os.path.join(args.output_dir, 'variance.csv')

    # retrieve loadings, eigenvalues, communalities, variance
    loadings = analyzer.loadings
    # only the first item returned by get_eigenvalues() is saved
    eigen, _ = analyzer.get_eigenvalues()
    communalities = analyzer.get_communalities()
    variance = analyzer.get_factor_variance()

    # save the files
    logger.info("Saving files...")
    loadings.to_csv(path_loadings)
    eigen.to_csv(path_eigen)
    communalities.to_csv(path_communalities)
    variance.to_csv(path_variance)
Example #18
0
def get_factor_eigenvalues(df):
    """Fit an unrotated ``FactorAnalyzer`` on ``df`` and return the first of
    the two items produced by ``get_eigenvalues()``."""
    analyzer = FactorAnalyzer(rotation=None)
    analyzer.fit(df)
    eigenvalues, _ = analyzer.get_eigenvalues()
    return eigenvalues
Example #19
0
 def test_analyze_bad_svd_method(self):
     """Fit random 100x5 data with ``svd_method='foo'``.

     NOTE(review): 'foo' is presumably not a supported SVD method, so the
     fit is likely expected to fail; the expectation itself is not
     visible in this excerpt -- confirm.
     """
     samples = np.random.randn(100, 5)
     analyzer = FactorAnalyzer(svd_method='foo')
     analyzer.fit(samples)
Example #20
0
 def test_analyze_rotation_value_error(self):
     """Fit random 100x5 data with the rotation name 'blah'.

     NOTE(review): 'blah' is presumably not a supported rotation, so the
     fit is likely expected to fail; the expectation itself is not
     visible in this excerpt -- confirm.
     """
     samples = np.random.randn(100, 5)
     analyzer = FactorAnalyzer(rotation='blah', n_factors=1)
     analyzer.fit(samples)