Example #1
0
    def test_abdi_valentin(self):
        # Reference values come from tables 2, 3 and 4 of
        # "Multiple Correspondence Analysis" (Hervé Abdi & Dominique
        # Valentin, 2007):
        # http://www.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf
        wines = pandas.read_table('data/burgundies.csv', skiprows=1, sep=',',
                                  index_col=0)

        # --- Part 1: eigenvalues and factor scores, Benzecri-corrected ---
        corrected = MCA(wines.drop('oak_type', axis=1), ncols=10)

        expected_eigenvalues = [0.7004, 0.0123, 0.0003]
        assert_allclose(expected_eigenvalues, corrected.E[:3], atol=1e-4)

        expected_row_scores = [
            [0.86, 0.08], [-0.71, -0.16], [-0.92, 0.08],
            [-0.86, 0.08], [0.92, 0.08], [0.71, -0.16],
        ]
        assert_allclose(expected_row_scores, corrected.fs_r(N=2), atol=1e-2)

        expected_col_scores = array([
            [.90, -.90, -.97, .00, .97, -.90, .90, .90, -.90, -.9,
             .90, -.97, .00, .97, -.90, .90, .28, -.28, -.90, .90,
             -.90, .9, .90, -.90],
            [.00, .00, .18, -.35, .18, .00, .00, .00, .0, .00, .00,
             .18, -.35, .18, .00, .00, .0, .00, .00, .00, .00, .00,
             .00, .00],
        ])
        # The last two published entries are excluded from the comparison
        # (presumably they belong to the dropped 'oak_type' variable).
        assert_allclose(expected_col_scores.T[:-2], corrected.fs_c(N=2),
                        atol=1e-2)

        # Contributions are published in thousandths, hence the scaling.
        expected_row_contrib = [
            [177, 121, 202, 177, 202, 121],
            [83, 333, 83, 83, 83, 333],
        ]
        assert_allclose(expected_row_contrib, 1000*corrected.cont_r(N=2).T,
                        atol=1)

        expected_col_contrib = [
            [58, 58, 44, 0, 44, 58, 58, 58, 58, 58, 58, 44, 0, 44,
             58, 58, 6, 6, 58, 58, 58, 58],
            [0, 0, 83, 333, 83, 0, 0, 0, 0, 0, 0, 83, 333, 83, 0,
             0, 0, 0, 0, 0, 0, 0],
        ]
        assert_allclose(expected_col_contrib, 1000*corrected.cont_c(N=2).T,
                        atol=1)

        # cos_c and cos_r are deliberately not tested against this paper: the
        # author believes the source itself is mistaken. The paper (like other
        # references) defines the squared cosine as f**2/d**2, which does not
        # agree with its tables 3 and 4. Table 3 appears to use f**2/I with
        # I = 1.2, the inertia before Benzecri correction; how table 4 was
        # derived is unknown. The implemented formula does match the figures
        # in (Abdi & Bera, 2014), which has its own test.
        #
        # A supplementary-column check for 'oak_type' (via dummy() and
        # fs_c_sup) was sketched here but is not enabled.

        # --- Part 2: the same analysis without Benzecri correction ---
        uncorrected = MCA(wines.drop('oak_type', axis=1), ncols=10,
                          benzecri=False)
        assert_allclose([0.8532, 0.2, 0.1151, 0.0317],
                        (uncorrected.s**2)[:4], atol=1e-4)

        # --- Part 3: share of explained variance, with and without the
        # Benzecri and Greenacre corrections ---
        expected_plain = [.7110, .1667, .0959, .0264, 0., 0.]
        expected_benzecri = [.9823, .0173, .0004, 0., 0., 0.]
        expected_default = [.9519, .0168, .0004, 0., 0., 0.]
        # (model, positional args for expl_var, expected shares); both models
        # are expected to agree when expl_var() is called with its defaults.
        checks = (
            (uncorrected, (False,), expected_plain),
            (uncorrected, (), expected_default),
            (corrected, (False,), expected_benzecri),
            (corrected, (), expected_default),
        )
        for model, args, expected in checks:
            assert_allclose(model.expl_var(*args), expected, atol=1e-4)
Example #2
0
    def test_abdi_williams(self):
        # Reference values: "Correspondence Analysis" (Abdi & Williams, 2010),
        # SAGE Encyclopedia of Research Design, Table 4, page 16.
        # www.utdallas.edu/~herve/abdi-CorrespondenceAnaysis2010-pretty.pdf
        tol = 1e-4

        writers = pandas.read_table('data/french_writers.csv', sep=',',
                                    index_col=0, skiprows=0)
        ca = MCA(writers, benzecri=False)

        # `c` and `r` vectors (presumably the column and row masses).
        assert_allclose(ca.c, [.2973, .5642, .1385], atol=tol)
        assert_allclose(ca.r, [.0189, .1393, .2522, .3966, .1094, .0835],
                        atol=tol)

        # Row factor scores on the first two dimensions, then `L`.
        expected_row_scores = [
            [0.2398, 0.1895, 0.1033, -0.0918, -0.2243, 0.0475],
            [0.0741, 0.1071, -0.0297, 0.0017, 0.0631, -0.1963],
        ]
        assert_allclose(ca.fs_r(N=2).T, expected_row_scores, atol=tol)
        assert_allclose(ca.L, [.0178, .0056], atol=tol)

        # The computed column scores are negated before being compared with
        # the published figures.
        expected_col_scores = [
            [-0.0489, 0.0973, -0.2914],
            [.1115, -0.0367, -0.0901],
        ]
        assert_allclose(-ca.fs_c(N=2).T, expected_col_scores, atol=tol)

        # Row-side cont_r and cos_r values.
        expected_row_contrib = [
            [0.0611, 0.2807, 0.1511, 0.1876, 0.3089, 0.0106],
            [0.0186, 0.2864, 0.0399, 0.0002, 0.0781, 0.5767],
        ]
        assert_allclose(ca.cont_r(N=2).T, expected_row_contrib, atol=tol)

        expected_row_cos = [
            [0.9128, 0.7579, 0.9236, 0.9997, 0.9266, 0.0554],
            [0.0872, 0.2421, 0.0764, 0.0003, 0.0734, 0.9446],
        ]
        assert_allclose(ca.cos_r(N=2).T, expected_row_cos, atol=tol)

        # Column-side cont_c and cos_c values.
        expected_col_contrib = [
            [0.0399, 0.2999, 0.6601],
            [0.6628, 0.1359, 0.2014],
        ]
        assert_allclose(ca.cont_c(N=2).T, expected_col_contrib, atol=tol)

        expected_col_cos = [
            [0.1614, 0.8758, 0.9128],
            [0.8386, 0.1242, 0.0872],
        ]
        assert_allclose(ca.cos_c(N=2).T, expected_col_cos, atol=tol)

        # `dc` and `dr` vectors.
        assert_allclose(ca.dc, [0.0148, 0.0108, 0.0930], atol=tol)
        assert_allclose(ca.dr,
                        [0.0630, 0.0474, 0.0116, 0.0084, 0.0543, 0.0408],
                        atol=tol)

        # Project one supplementary row, supplied as a single-row DataFrame.
        extra_row = pandas.DataFrame([216, 139, 26]).T
        assert_allclose(ca.fs_r_sup(extra_row, 2), [[-0.0908, 0.5852]],
                        atol=tol)

        # Project supplementary columns; these are checked more loosely.
        extra_cols = pandas.read_table('data/french_writers_supp.csv',
                                       sep=',', index_col=0, skiprows=0)
        expected_sup_col_scores = [
            [-0.0596, -0.1991, -0.4695, -0.4008],
            [0.2318, 0.2082, -0.2976, -0.4740],
        ]
        assert_allclose(ca.fs_c_sup(extra_cols).T, expected_sup_col_scores,
                        atol=1e-3)
Example #3
0
	def test_sparse(self):
		# The dense and sparse code paths must produce the same decomposition
		# for the same random 100x100 table of zeros and ones.

		def match_sign(reference, candidate):
			# Rows from the two solvers may differ by an overall sign; flip
			# the candidate unless its first entry has the same sign as the
			# reference's first entry.
			if sign(reference[0])*sign(candidate[0]) > 0:
				return candidate
			return -candidate

		table = DataFrame(randint(0,2,(100,100)))
		dense = MCA(table, sparse=False)
		sparse = MCA(table, sparse=True)

		# The dense result is compared without its last entry of `s`.
		assert_allclose(dense.s[:-1], sparse.s, atol=1e-12)

		for dense_row, sparse_row in zip(dense.P.T, sparse.P.T):
			assert_allclose(dense_row, match_sign(dense_row, sparse_row), atol=1e-12)
		for dense_row, sparse_row in zip(dense.Q, sparse.Q):
			assert_allclose(dense_row, match_sign(dense_row, sparse_row), atol=1e-12)
Example #4
0
    def test_abdi_williams(self):
        # Published figures used below: "Correspondence Analysis"
        # (Abdi & Williams, 2010), SAGE Encyclopedia of Research Design,
        # Table 4, page 16.
        # www.utdallas.edu/~herve/abdi-CorrespondenceAnaysis2010-pretty.pdf
        counts = pandas.read_table('data/french_writers.csv', index_col=0,
                                   skiprows=0, sep=',')
        model = MCA(counts, benzecri=False)

        # `c` / `r` vectors (presumably column and row masses).
        want_c = [.2973, .5642, .1385]
        want_r = [.0189, .1393, .2522, .3966, .1094, .0835]
        assert_allclose(model.c, want_c, atol=1e-4)
        assert_allclose(model.r, want_r, atol=1e-4)

        # Row factor scores (two dimensions) and `L`.
        want_fs_r = [[0.2398, 0.1895, 0.1033, -0.0918, -0.2243, 0.0475],
                     [0.0741, 0.1071, -0.0297, 0.0017, 0.0631, -0.1963]]
        assert_allclose(model.fs_r(N=2).T, want_fs_r, atol=1e-4)
        want_L = [.0178, .0056]
        assert_allclose(model.L, want_L, atol=1e-4)

        # Column factor scores are compared after negation.
        want_neg_fs_c = [[-0.0489, 0.0973, -0.2914],
                         [.1115, -0.0367, -0.0901]]
        assert_allclose(-model.fs_c(N=2).T, want_neg_fs_c, atol=1e-4)

        # cont_r / cos_r for the rows.
        want_cont_r = [[0.0611, 0.2807, 0.1511, 0.1876, 0.3089, 0.0106],
                       [0.0186, 0.2864, 0.0399, 0.0002, 0.0781, 0.5767]]
        assert_allclose(model.cont_r(N=2).T, want_cont_r, atol=1e-4)
        want_cos_r = [[0.9128, 0.7579, 0.9236, 0.9997, 0.9266, 0.0554],
                      [0.0872, 0.2421, 0.0764, 0.0003, 0.0734, 0.9446]]
        assert_allclose(model.cos_r(N=2).T, want_cos_r, atol=1e-4)

        # cont_c / cos_c for the columns.
        want_cont_c = [[0.0399, 0.2999, 0.6601],
                       [0.6628, 0.1359, 0.2014]]
        assert_allclose(model.cont_c(N=2).T, want_cont_c, atol=1e-4)
        want_cos_c = [[0.1614, 0.8758, 0.9128],
                      [0.8386, 0.1242, 0.0872]]
        assert_allclose(model.cos_c(N=2).T, want_cos_c, atol=1e-4)

        # `dc` / `dr` vectors.
        want_dc = [0.0148, 0.0108, 0.0930]
        want_dr = [0.0630, 0.0474, 0.0116, 0.0084, 0.0543, 0.0408]
        assert_allclose(model.dc, want_dc, atol=1e-4)
        assert_allclose(model.dr, want_dr, atol=1e-4)

        # A supplementary row is passed in as a one-row DataFrame.
        sup_row = pandas.DataFrame([216, 139, 26]).T
        want_fs_r_sup = [[-0.0908, 0.5852]]
        assert_allclose(model.fs_r_sup(sup_row, 2), want_fs_r_sup, atol=1e-4)

        # Supplementary columns use a looser tolerance than the rest.
        sup_cols = pandas.read_table('data/french_writers_supp.csv',
                                     index_col=0, skiprows=0, sep=',')
        want_fs_c_sup = [[-0.0596, -0.1991, -0.4695, -0.4008],
                         [0.2318, 0.2082, -0.2976, -0.4740]]
        assert_allclose(model.fs_c_sup(sup_cols).T, want_fs_c_sup, atol=1e-3)
Example #5
0
    def test_abdi_bera(self):
        # Reference values: "Correspondence Analysis" (Herve Abdi & Michel
        # Bera, 2014), Springer Encyclopedia of Social Networks and Mining.
        # www.utdallas.edu/~herve/abdi-AB2014_CA.pdf
        music_color = pandas.read_table('data/music_color.csv', sep=',',
                                        index_col=0, skiprows=0)
        ca = MCA(music_color, benzecri=False)

        # ---- Table 1, page 13: `r` and `c` vectors ----
        expected_r = [.121, .091, .126, .116, .096,
                      .066, .071, .146, .061, .106]
        assert_allclose(ca.r, expected_r, atol=1e-3)
        # All nine entries of `c` are expected to be about .11.
        assert_allclose(ca.c, [.11] * 9, atol=1e-2)

        # ---- Table 2, page 14: row scores, cont_r, cos_r ----
        expected_row_scores = [
            [-0.026, 0.299], [-0.314, 0.232], [-0.348, 0.202],
            [-0.044, -0.490], [-0.082, -0.206], [-0.619, 0.475],
            [-0.328, 0.057], [1.195, 0.315], [-0.57, 0.3],
            [0.113, -0.997],
        ]
        assert_allclose(ca.fs_r(N=2), expected_row_scores, atol=1e-3)

        # cont_* and cos_* are published in thousandths, hence the * 1000.
        expected_row_contrib = [
            [0, 56], [31, 25], [53, 27], [1, 144], [2, 21],
            [87, 77], [26, 1], [726, 75], [68, 28], [5, 545],
        ]
        assert_allclose(ca.cont_r(N=2) * 1000, expected_row_contrib, atol=1)

        expected_row_cos = [
            [3, 410], [295, 161], [267, 89], [5, 583], [13, 81],
            [505, 298], [77, 2], [929, 65], [371, 103], [12, 973],
        ]
        assert_allclose(ca.cos_r(N=2) * 1000, expected_row_cos, atol=1)

        # ---- Table 3, page 17: column scores, cont_c, cos_c ----
        expected_col_scores = [
            [-0.541, 0.386], [-.257, .275], [-.291, -.309],
            [.991, .397], [-.122, -.637], [-.236, .326],
            [.954, -.089], [-.427, .408], [-.072, -.757],
        ]
        assert_allclose(ca.fs_c(N=2), expected_col_scores, atol=1e-3)

        expected_col_contrib = [
            [113, 86], [25, 44], [33, 55], [379, 91], [6, 234],
            [22, 61], [351, 5], [70, 96], [2, 330],
        ]
        assert_allclose(ca.cont_c(N=2) * 1000, expected_col_contrib, atol=1)

        expected_col_cos = [
            [454, 232], [105, 121], [142, 161], [822, 132],
            [26, 709], [78, 149], [962, 8], [271, 249], [7, 759],
        ]
        assert_allclose(ca.cos_c(N=2) * 1000, expected_col_cos, atol=1)

        # ---- First two entries of `L`, and the total inertia ----
        assert_allclose(ca.L[:2], [.287, .192], atol=2e-3)
        self.assertAlmostEqual(ca.inertia, 0.746, 3)
Example #6
0
    def test_abdi_bera(self):
        # Published figures used below: "Correspondence Analysis"
        # (Herve Abdi & Michel Bera, 2014), Springer Encyclopedia of Social
        # Networks and Mining.
        # www.utdallas.edu/~herve/abdi-AB2014_CA.pdf
        table = pandas.read_table('data/music_color.csv', index_col=0,
                                  skiprows=0, sep=',')
        model = MCA(table, benzecri=False)

        # Table 1, page 13
        want_r = [.121, .091, .126, .116, .096, .066, .071, .146, .061, .106]
        want_c = [.11, .11, .11, .11, .11, .11, .11, .11, .11]
        assert_allclose(model.r, want_r, atol=1e-3)
        assert_allclose(model.c, want_c, atol=1e-2)

        # Table 2, page 14 (cont_r and cos_r are tabulated in thousandths)
        want_fs_r = [[-0.026, 0.299], [-0.314, 0.232],
                     [-0.348, 0.202], [-0.044, -0.490],
                     [-0.082, -0.206], [-0.619, 0.475],
                     [-0.328, 0.057], [1.195, 0.315],
                     [-0.57, 0.3], [0.113, -0.997]]
        want_cont_r = [[0, 56], [31, 25], [53, 27],
                       [1, 144], [2, 21], [87, 77],
                       [26, 1], [726, 75], [68, 28],
                       [5, 545]]
        want_cos_r = [[3, 410], [295, 161], [267, 89], [5, 583], [13, 81],
                      [505, 298], [77, 2], [929, 65], [371, 103], [12, 973]]
        assert_allclose(model.fs_r(N=2), want_fs_r, atol=1e-3)
        assert_allclose(model.cont_r(N=2)*1000, want_cont_r, atol=1)
        assert_allclose(model.cos_r(N=2)*1000, want_cos_r, atol=1)

        # Table 3, page 17 (cont_c and cos_c are tabulated in thousandths)
        want_fs_c = [[-0.541, 0.386], [-.257, .275], [-.291, -.309],
                     [.991, .397], [-.122, -.637], [-.236, .326],
                     [.954, -.089], [-.427, .408], [-.072, -.757]]
        want_cont_c = [[113, 86], [25, 44], [33, 55],
                       [379, 91], [6, 234], [22, 61],
                       [351, 5], [70, 96], [2, 330]]
        want_cos_c = [[454, 232], [105, 121], [142, 161], [822, 132],
                      [26, 709], [78, 149], [962, 8], [271, 249], [7, 759]]
        assert_allclose(model.fs_c(N=2), want_fs_c, atol=1e-3)
        assert_allclose(model.cont_c(N=2)*1000, want_cont_c, atol=1)
        assert_allclose(model.cos_c(N=2)*1000, want_cos_c, atol=1)

        # Leading two entries of `L`, then the total inertia to 3 places.
        want_L = [.287, .192]
        assert_allclose(model.L[:2], want_L, atol=2e-3)
        self.assertAlmostEqual(model.inertia, 0.746, 3)
Example #7
0
# Scatter the first two PCA columns, one colour per value of yyy.
# Points with yyy == 0..4 are labelled 1..5 in the legend.
plt.figure(figsize=(8, 8))
for color, i, target_name in zip(colors, range(5), range(1, 6)):
    plt.scatter(
        X_pca[yyy == i, 0],
        X_pca[yyy == i, 1],
        color=color,
        lw=0.1,
        label=target_name,
    )

plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.xlabel("PCA1")
plt.ylabel("PCA2")
plt.show()

# Start the clustering input from the PCA projection alone.
X_clust = X_pca
#endregion



# %% region[red] MULTIPLE CORRESPONDENCE ANALYSIS
# Fit MCA on XXX_cat (presumably the categorical features) and plot the
# explained-variance ratio of the first 20 dimensions.
mca = MCA(XXX_cat)
plt.plot(
    mca.expl_var(N=20),
    color="PaleVioletRed",
    label="Explained Variance Ratio",
    linewidth=3.0,
)
plt.xlabel("Number of components")
plt.ylabel("Explained Variance Ratio")
plt.legend()
plt.show()

# Keep 20 dimensions of row factor scores and append them column-wise to the
# PCA projection to form the clustering input.
X_mca = mca.fs_r(N=20)
X_clust = np.concatenate(
    (np.asarray(X_pca), np.asarray(X_mca)),
    axis=1,
)
#endregion



# %% region[orange] INDEPENDENT COMPONENT ANALYSIS
# Configure a FastICA estimator that extracts 645 components.
ica = FastICA(n_components=645)
Example #8
0
	def test_abdi_valentin(self):
		# Data taken from http://www.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf
		# Multiple Correspondence Analysis
		# (Hervé Abdi & Dominique Valentin, 2007)
		# See in particular Table 2,3,4.

		# first we check the eigenvalues and factor scores with Benzecri
		# correction
		df = read_table('data/burgundies.csv', skiprows=1, sep=',', index_col=0)
		mca_df = MCA(df.drop('oak_type', axis=1), ncols=10)
		assert_allclose([0.7004, 0.0123, 0.0003], mca_df.E[:3], atol=1e-4)
		true_fs_row = [[0.86, 0.08], [-0.71, -0.16], [-0.92, 0.08],
					   [-0.86, 0.08], [0.92, 0.08], [0.71, -0.16]]
		assert_allclose(true_fs_row, mca_df.fs_r(N=2), atol=1e-2)
		true_fs_col = [[.90, -.90, -.97, .00, .97, -.90, .90, .90, -.90, -.9,
						.90, -.97, .00, .97, -.90, .90, .28, -.28, -.90, .90,
						-.90, .9, .90, -.90],
					   [.00, .00, .18, -.35, .18, .00, .00, .00, .0, .00, .00,
						.18, -.35, .18, .00, .00, .0, .00, .00, .00, .00, .00,
						.00, .00]]
		assert_allclose(array(true_fs_col).T[:-2], mca_df.fs_c(N=2), atol=1e-2)

		true_cont_r = [[177, 121, 202, 177, 202, 121],
					   [83, 333, 83, 83, 83, 333]]
		assert_allclose(true_cont_r, 1000*mca_df.cont_r(N=2).T, atol=1)

		true_cont_c = [[58, 58, 44, 0, 44, 58, 58, 58, 58, 58, 58, 44, 0, 44,
						58, 58, 6, 6, 58, 58, 58, 58],
					   [0, 0, 83, 333, 83, 0, 0, 0, 0, 0, 0, 83, 333, 83, 0,
						0, 0, 0, 0, 0, 0, 0]]
		assert_allclose(true_cont_c, 1000*mca_df.cont_c(N=2).T, atol=1)

		# I declined to include a test for the cos_c and cos_r functions because
		# I think the source itself is mistaken. In Abdi-MCA2007-pretty.pdf as in
		# elsewhere the formula for the squared cosine is f**2/d**2. This does not
		# agree with tables 3 and 4. In table 3 the squared cosine is derived from
		# f**2/I where I = 1.2 is the inertia before Benzecri correction. I have no
		# idea how the squared cosines in table 4 were derived. My formula, however
		# does comport with the figures given in (Abdi & Bera, 2014), tested next.

		# oak = DataFrame([1,2,2,2,1,1], columns=['oak_type'])
		# print(dummy(oak))
		# mca_df.fs_c_sup(dummy(oak))

		# ... then without Benzecri correction
		mca_df_i = MCA(df.drop('oak_type', axis=1), ncols=10, benzecri=False)
		assert_allclose([0.8532, 0.2, 0.1151, 0.0317],
						(mca_df_i.s**2)[:4], atol=1e-4)

		# check percentage of explained variance both with and without Benzecri
		# and Greenacre corrections
		true_expl_var_i = [.7110, .1667, .0959, .0264, 0., 0.]
		true_expl_var_z = [.9823, .0173, .0004, 0., 0., 0.]
		true_expl_var_c = [.9519, .0168, .0004, 0., 0., 0.]
		assert_allclose(mca_df_i.expl_var(False), true_expl_var_i, atol=1e-4)
		assert_allclose(mca_df_i.expl_var(), true_expl_var_c, atol=1e-4)
		assert_allclose(mca_df.expl_var(False), true_expl_var_z, atol=1e-4)
		assert_allclose(mca_df.expl_var(), true_expl_var_c, atol=1e-4)
Example #9
0
def correspondence_analysis(edges, n=1):
    """
    Performs correspondence analysis on a set of features.

    Most useful in the context of network analysis, where you might wish to, for example, \
    identify the underlying dimension in a network of Twitter users by using a matrix representing whether \
    or not they follow one another (when news and political accounts are included, the \
    underlying dimension often appears to approximate the left-right political spectrum.)

    :param edges: A :py:class:`pandas.DataFrame` of NxN where both the rows and columns are "nodes" and the values \
    are some sort of closeness or similarity measure (like a cosine similarity matrix). Row scores are labelled \
    with ``edges.columns``, so the rows must be in the same order as the columns.
    :param n: The number of dimensions to extract
    :type n: int
    :return: A :py:class:`pandas.DataFrame` with one row per node, a ``node`` column holding the node label and \
    one ``mca_<i>`` column (``mca_1`` .. ``mca_n``) per extracted dimension. Rows are ordered by the first \
    dimension, largest first. If fewer than ``n`` dimensions are available, only the available ones are included.

    Usage::

        from pewanalytics.stats.dimensionality_reduction import correspondence_analysis
        import nltk
        import pandas as pd
        from sklearn.metrics.pairwise import linear_kernel
        from sklearn.feature_extraction.text import TfidfVectorizer

        nltk.download("inaugural")
        df = pd.DataFrame([
            {"speech": fileid, "text": nltk.corpus.inaugural.raw(fileid)} for fileid in nltk.corpus.inaugural.fileids()
        ])

        vec = TfidfVectorizer(min_df=10, max_df=.9).fit(df['text'])
        tfidf = vec.transform(df['text'])

        cosine_similarities = linear_kernel(tfidf)
        matrix = pd.DataFrame(cosine_similarities, columns=df['speech'])

        # Looks like the main source of variation in the language of inaugural speeches is time!

        >>> mca = correspondence_analysis(matrix)

        >>> mca.sort_values("mca_1").head()
                        node     mca_1
        57  1993-Clinton.txt -0.075508
        56    2017-Trump.txt -0.068168
        55  1997-Clinton.txt -0.061567
        54    1973-Nixon.txt -0.060698
        53     1989-Bush.txt -0.056305

        >>> mca.sort_values("mca_1").tail()
                       node     mca_1
        4    1877-Hayes.txt  0.040037
        3   1817-Monroe.txt  0.040540
        2     1845-Polk.txt  0.042847
        1   1849-Taylor.txt  0.050937
        0  1829-Jackson.txt  0.056201


    """

    mca_counts = MCA(edges)

    # Pair every node label with its row of factor scores and order the pairs
    # by the score on the first dimension, largest first.
    ranked = sorted(
        zip(edges.columns, mca_counts.fs_r(N=n)),
        key=lambda pair: pair[1][0],
        reverse=True,
    )

    rows = []
    for node, scores in ranked:
        row = {"node": node}
        # The decomposition may yield fewer than ``n`` dimensions. Bound the
        # loop explicitly instead of swallowing every exception, so genuine
        # errors (and KeyboardInterrupt) are no longer silently hidden.
        for i in range(min(n, len(scores))):
            row["mca_{}".format(i + 1)] = scores[i]
        rows.append(row)

    return pd.DataFrame(rows)