Example #1
    def __init__(self, X, threshold):  # assume receiving a DataFrame
        # Exclude the first column, which holds the observations' names
        columns = X.columns[1:]
        index = X.index
        self.X = X[columns].values
        self.pca = pca.PCA(self.X)
        PC = self.pca.getPrincipalComponents()
        a = self.pca.getEigenVectors()
        alpha = self.pca.getEigenValues()
        correl = self.pca.getCorrelation()

        self.scores, q, beta, communalities = pp.evaluate(PC, correl, alpha)
        # Bartlett's test of sphericity
        self.Bartlett_test = fa.calculate_bartlett_sphericity(
            pd.DataFrame(self.X, index=index, columns=columns))
        # Kaiser-Meyer-Olkin test as a measure of sampling adequacy
        self.KMO_test = fa.calculate_kmo(
            pd.DataFrame(self.X, index=index, columns=columns))
        if self.KMO_test[1] < float(threshold):
            print("No significant factor found!")
            exit(1)

        # Note: analyze() comes from older factor_analyzer releases; newer versions use fit()
        self.fa = fa.FactorAnalyzer()
        self.fa.analyze(pd.DataFrame(self.X, index=index, columns=columns),
                        rotation=None)
        self.loadings = self.fa.loadings

        self.fa.analyze(pd.DataFrame(self.X, index=index, columns=columns),
                        rotation='quartimax')
        self.rotatedLoadings = self.fa.loadings

        self.eigenValues = self.fa.get_eigenvalues()
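A minimal usage sketch, under the assumption that this __init__ belongs to a wrapper class (called FactorModel here purely for illustration) and that pca and pp are the project's own helper modules:

# Hypothetical driver; FactorModel is an assumed name for the class above
import pandas as pd

df = pd.read_csv("observations.csv")  # first column holds the observations' names
model = FactorModel(df, threshold=0.6)
print(model.KMO_test[1])        # overall KMO measure
print(model.rotatedLoadings)    # quartimax-rotated loadings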
Example #2
    def data_suitable(df, kmo_value=False, ignore=False):

        # Bartlett's sphericity test: ensure the data is not an identity matrix
        chi_square_value, p_value = calculate_bartlett_sphericity(df)

        # KMO test: ensure the observed data is adequate for FA. Must be > 0.6
        kmo_all, kmo_model = calculate_kmo(df)

        if (p_value > 0.1 or kmo_model < 0.6) and not ignore:
            raise Exception(
                "Data is not suitable for Factor Analysis! "
                "Identity test p-value: {}. KMO model score: {}".format(p_value, kmo_model))

        if kmo_value:
            return kmo_model
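A short usage sketch with synthetic data, assuming data_suitable is reachable as a plain function (it takes no self) and using the import path factor_analyzer exposes; the random frame is illustrative only:

import numpy as np
import pandas as pd
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

rng = np.random.default_rng(0)
base = rng.normal(size=(200, 1))
# Columns share a common component so the suitability checks can pass
df = pd.DataFrame(base + 0.5 * rng.normal(size=(200, 6)), columns=list("ABCDEF"))
print(data_suitable(df, kmo_value=True))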
Example #3
    def kmo(self, threshold=0.5):
        self.kmo_value = fa.calculate_kmo(self.t)
        if self.kmo_value[1] < threshold:
            self.message = "No significant factor found!"
        return self
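A minimal sketch of the context this method assumes; the host class name and the self.t attribute holding the DataFrame are illustrative assumptions:

import numpy as np
import pandas as pd
import factor_analyzer as fa

class Pipeline:  # hypothetical host class for the kmo() method above
    def __init__(self, table: pd.DataFrame):
        self.t = table        # data the KMO test runs on
        self.message = None

    def kmo(self, threshold=0.5):
        self.kmo_value = fa.calculate_kmo(self.t)
        if self.kmo_value[1] < threshold:
            self.message = "No significant factor found!"
        return self

df = pd.DataFrame(np.random.default_rng(0).normal(size=(100, 5)), columns=list("abcde"))
result = Pipeline(df).kmo(threshold=0.6)
if result.message:
    print(result.message)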
Example #4
    att_data.dropna(inplace=True)

    swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2',
        '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5'
    ]
    for i in att_data.index:
        for col in swapList:
            swapOrdering(att_data, i, semester + col)

    # KMO and Bartlett tests
    X = att_data.copy().values
    X = check_array(X, force_all_finite='allow-nan')

    statistic, p_value = calculate_bartlett_sphericity(X)
    #print("\nBartlett sphericity p={0}".format(p_value))
    kmo_per_variable, kmo_total = calculate_kmo(X)
    #print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))

    # Create factor analysis object and perform factor analysis
    # Using maximum likelihood analysis (ml)
    n_factors = 5
    fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
    fa.fit(att_data)

    # Kaiser normalization and oblimin rotation
    rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
    loadings = rotator.fit_transform(fa.loadings_)

    # Set FA loadings to be rotator loadings
    fa.loadings_ = loadings
    #print (loadings)
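One way to continue from here, mirroring the fuller version of this snippet in Example #5, is to pull factor scores now that the rotated loadings are in place (att_data and n_factors are the names already defined above):

# Factor scores under the oblimin-rotated loadings
factor_scores = fa.transform(att_data)
factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index,
                             columns=["Factor " + str(i + 1) for i in range(n_factors)])
print(factor_scores.head())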
Example #5
def score(database, semester, year, season, answer_key, savedname):
    '''
    The score function reads in a QuaRCS dataset and answer key file to create a series of columns
    to add to the dataset. The function creates columns for:
        - score on a binary scale (1 for correct, 0 for incorrect)
        - total score
        - totals and means by category
        - number of questions answered
        - total and mean confidence
    Args:
        database: pre or post QuaRCS dataset for a semester
        semester: 'PRE' or 'PST'
        year: two-digit year string, e.g. '17'
        season: season code, e.g. 'Fa', 'Su'
        answer_key: QuaRCS Assessment Answer Key
        savedname: base name for the saved file
    Output:
        name of file + '_scored' as .csv file
    Example:
        score('QuaRCS_Summer_2017_Pre.csv', 'PRE', '17', 'Su',
              'QuaRCS Assessment Answer Key.csv', 'QuaRCS_Summer_2017_Pre')
        New file saved under QuaRCS_Summer_2017_Pre_scored.csv
        Check folder for files
    By:
        Abdoulaye Sanogo, 08/11/2017
    Modified by:
        Ilija Nikolov, 5 March 2018: uses numerical values of question/answer
        rather than string values.
    Future improvements:
        add columns for confidence means and totals by category
        add extra columns after insert so the deletion of columns will not be necessary
    '''

    question = semester + "_Q" # question = 'PRE_Q' or 'PST_Q'

    data = pd.read_csv(database, encoding = 'utf-8', skiprows = [1,2], header = 0)
    df = pd.read_csv(answer_key, encoding = 'utf-8')


    cols = list(data.columns.values)
    c = len(cols)     # number of columns
    e = 0             # count of question columns, used to detect the survey version
    h = len(data)     # number of rows (students)

    # Adds the Q#_SCORE column right next to each question
    questions = np.unique(df['Question #'])

    for item in questions:
        if question + str(item) in data.columns:
            data.insert(data.columns.get_loc(question + str(item)) + 1,
                        question + str(item) + '_SCORE', 0)

    # Count question columns to detect the version: e >= 50 --> Full, e < 50 --> Lite
    for d in range(c):
        column = cols[d]
        column = column[0:5]

        if question == column:
            e = e + 1

    data.insert(6, 'VERSION', " ")

    if e == 50:
        if year == "16" and season == "Fa":
            data['VERSION'] = "Fl_2.0"
            # If the value "progress bar" is in COMMENTS, change the version to 2.1
            for v in range(h):
                if 'COMMENTS' in data.columns:
                    if data.loc[v, 'COMMENTS'] == "progress bar":
                        data.loc[v, 'VERSION'] = "Fl_2.1"
        else:
            data['VERSION'] = "Fl_1.1"
    elif e == 54:
        data['VERSION'] = "Fl_1.0"
        data = data.drop([semester + '_Q18'], axis=1)
        data = data.drop([semester + '_Q18CF'], axis=1)
        data = data.drop([semester + '_Q25'], axis=1)
        data = data.drop([semester + '_Q25CF'], axis=1)
        e = 50
    elif e == 22:
        data['VERSION'] = "Lt_1.0"
    elif e == 30:
        intyr = int(year)
        if (intyr >= 19 or (year == "18" and season == "Fa")):
            data['VERSION'] = "Lt_2.1"
        else:
            data['VERSION'] = "Lt_2.0"
    elif e == 28:
        data['VERSION'] = "SM_1.0"

    # New columns for the totals
    data[semester + '_TOTAL'] = np.nan
    data[semester + '_PCT_TOTAL'] = np.nan
    data[semester + '_GR_TOTAL'] = np.nan
    data[semester + '_GR_MEAN'] = np.nan
    data[semester + '_AR_TOTAL'] = np.nan
    data[semester + '_AR_MEAN'] = np.nan
    data[semester + '_PR_TOTAL'] = np.nan
    data[semester + '_PR_MEAN'] = np.nan
    data[semester + '_PC_TOTAL'] = np.nan
    data[semester + '_PC_MEAN'] = np.nan
    data[semester + '_SP_TOTAL'] = np.nan
    data[semester + '_SP_MEAN'] = np.nan
    data[semester + '_TR_TOTAL'] = np.nan
    data[semester + '_TR_MEAN'] = np.nan
    data[semester + '_AV_TOTAL'] = np.nan
    data[semester + '_AV_MEAN'] = np.nan
    #data[semester + '_ER_MEAN'] = np.nan
    data[semester + '_UD_TOTAL'] = np.nan
    data[semester + '_UD_MEAN'] = np.nan
    data[semester + '_ES_TOTAL'] = np.nan
    data[semester + '_ES_MEAN'] = np.nan

    # Composite Variables
    data[semester + '_SELFEFF'] = np.nan
    data[semester + '_MATHANX'] = np.nan
    data[semester + '_MATHREL'] = np.nan
    data[semester + '_ACADMAT'] = np.nan
    data[semester + '_SCHMATH'] = np.nan

    # Map each question number to its correct answer from the key
    corr_ans = {15: 0, 12: 0, 14: 0, 26: 0, 27: 0, 23: 0, 28: 0, 19: 0, 3: 0, 16: 0, 13: 0, 31: 0,
                32: 0, 29: 0, 30: 0, 5: 0, 6: 0, 7: 0, 10: 0, 11: 0, 20: 0, 21: 0, 33: 0, 34: 0, 35: 0}
    for item in corr_ans:
        corr_ans[item] = int(df.loc[df['Question #'] == item, 'Correct Answer'].iloc[0])

    # Adds totals and means to total and means columns
    for nn in range(h):
        qn = {15: 0, 12:0, 14:0, 26:0, 27:0, 23:0, 28:0, 19:0, 3:0, 16:0, 13:0, 31:0, 32:0, 29:0, 30:0, 5:0, 6:0, 7:0, 10:0, 11:0, 20:0, 21:0, 33:0, 34:0, 35:0}

        for q_num in qn:
            try:
                if int(data.loc[nn, question + str(q_num)]) == corr_ans[q_num]:
                    qn[q_num] = 1
                    data.loc[nn, question + str(q_num) + '_SCORE'] = 1
            except (KeyError, ValueError):  # question absent or answer not numeric
                pass


        GR = int(np.nansum([qn[15], qn[14], qn[12], qn[29], qn[30], qn[13]]))
        AR = int(np.nansum([qn[15], qn[14], qn[26], qn[27], qn[23], qn[28], qn[19], qn[3], qn[16], qn[31], qn[32], qn[5], qn[6], qn[7], qn[29], qn[30], qn[10], qn[11], qn[20], qn[21], qn[33], qn[34], qn[35]]))
        PR = int(np.nansum([qn[15], qn[12], qn[14], qn[23], qn[28], qn[3], qn[16], qn[7], qn[10], qn[11], qn[20], qn[21], qn[33], qn[35], qn[13]]))
        PC = int(np.nansum([qn[27], qn[3], qn[32], qn[20], qn[21]]))
        SP = int(np.nansum([qn[27], qn[23], qn[28], qn[29], qn[30], qn[20], qn[21]]))
        TR = int(np.nansum([qn[26], qn[27], qn[23]]))
        AV = int(np.nansum([qn[31], qn[10], qn[11], qn[33], qn[34]]))
        UD = int(np.nansum([qn[31], qn[6], qn[7], qn[35], qn[16]]))
        ES = int(np.nansum([qn[15], qn[12], qn[14], qn[16], qn[13]]))
        data.loc[nn, semester + '_GR_TOTAL'] = GR
        data.loc[nn, semester + '_AR_TOTAL'] = AR
        data.loc[nn, semester + '_PR_TOTAL'] = PR
        data.loc[nn, semester + '_PC_TOTAL'] = PC
        data.loc[nn, semester + '_SP_TOTAL'] = SP
        data.loc[nn, semester + '_TR_TOTAL'] = TR
        data.loc[nn, semester + '_AV_TOTAL'] = AV
        data.loc[nn, semester + '_UD_TOTAL'] = UD
        data.loc[nn, semester + '_ES_TOTAL'] = ES
        total_full = 0

        for q_num in qn:
            total_full += qn[q_num]
        if e == 50:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(25)
            data.loc[nn, semester + '_GR_MEAN'] = GR/6
            data.loc[nn, semester + '_AR_MEAN'] = AR/23
            data.loc[nn, semester + '_PR_MEAN'] = PR/15
            data.loc[nn, semester + '_PC_MEAN'] = PC/5
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/5
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/5

        elif e == 22:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(11)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/9
            data.loc[nn, semester + '_PR_MEAN'] = PR/8
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_ES_MEAN'] = ES/5

            # Lacks enough questions for a meaningful subscore:
            # 1 question
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 questions
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
            # 1 question
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan

        elif e == 30:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(15)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/11
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/4
            data.loc[nn, semester + '_ES_MEAN'] = ES/5
            # Lacks enough questions for a meaningful subscore:
            # 1 question
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 questions
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan

        elif e == 28:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(14)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/9
            data.loc[nn, semester + '_PC_MEAN'] = PC/3
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/3

            # Lacks enough questions for a meaningful subscore:
            # 2 questions
            data.loc[nn, semester + '_TR_MEAN'] = np.nan
            data.loc[nn, semester + '_TR_TOTAL'] = np.nan
            # 1 question
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan



    data[semester + '_CF_TOTAL'] = np.nan
    data[semester + '_CF_TOTAL_CORR'] = np.nan
    data[semester + '_CF_TOTAL_INCORR'] = np.nan
    data[semester + '_CF_MEAN'] = np.nan
    data[semester + '_CF_MEAN_CORR'] = np.nan
    data[semester + '_CF_MEAN_INCORR'] = np.nan


    # Calculates confidence totals and means; adds to respective columns
    for u in range(h):
        qcf = {'15': 0, '12':0, '14':0, '26':0, '27':0, '23':0, '28':0, '19':0, '3':0, '16':0, '13':0, '31':0, '32':0, '29':0, '30':0, '5':0, '6':0, '7':0, '10':0, '11':0,'20':0, '21':0, '33':0, '34':0, '35':0}
        qc = {'15': 0, '12':0, '14':0, '26':0, '27':0, '23':0, '28':0, '19':0, '3':0, '16':0, '13':0, '31':0, '32':0, '29':0, '30':0, '5':0, '6':0, '7':0, '10':0, '11':0,'20':0, '21':0, '33':0, '34':0, '35':0}

        for q_num in qcf:
            try:
                qcf[q_num] = int(data.loc[u, question + str(q_num) + "CF"])
                qc[q_num] = int(data.loc[u, question + str(q_num) + '_SCORE'])
            except (KeyError, ValueError):  # question absent or blank response
                pass

        medscore = 0
        corrscore = 0
        incorrscore = 0
        confcount = 0
        for item in qcf:
            medscore += qcf[item]

            if qcf[item] > 0:
                confcount +=1

                if qc[item] == 1:
                    corrscore += qcf[item]
                else:
                    incorrscore += qcf[item]
        # Guard against division by zero when no confidence responses were given
        if confcount == 0:
            confcount = 1
        # Student's score
        numcorr = data.loc[u, semester + '_TOTAL']

        # Calculate confidence scores; every version shares the same formulas,
        # but the Lite (22) and SM (28) versions do not report CF_TOTAL_CORR
        if e in (30, 22, 28, 50):
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore if e in (30, 50) else np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount

            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

    data[semester + '_QCOMPLETE'] = 0
    data[semester + '_COMPFLAG'] = 0
    data[semester +'_EFFFLAG'] = 0

    # Counts number of completed columns
    try:
        if e == 50:
            q = [15, 12, 14, 26, 27, 23, 28, 19, 3, 16, 13, 31, 32, 29, 30, 5, 6, 7, 10, 11, 20, 21, 33, 34, 35]
        elif e == 22:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16]
        elif e == 30:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16, 10, 11, 33, 34]
        elif e == 28:
            q = [6, 7, 13, 14, 16, 20, 21, 23, 27, 28, 29, 30, 31, 35]

        for v in range(h):
            # Count up totals
            total = 0
            for w in q:
                answered = data.loc[v, question + str(w)]
                if str(answered) == 'nan' or str(answered) == ' ':
                    continue
                total += 1

            data.loc[v, semester + '_QCOMPLETE'] = total

            # Flag students who answered every question
            data.loc[v, semester + '_COMPFLAG'] = 1 if total == len(q) else 0
    except (KeyError, NameError):
        # Unknown version: q is undefined, or a question column is missing
        pass

    # Calculating effort column

    for v in range(h):
        # If there is no response for effort, mark completion as 0 for that student!
        if (pd.isnull(data.loc[v, semester + '_EFFORT'])):
            data.loc[v, semester + '_COMPFLAG'] = 0

        # If there is high effort, give full marks in flag
        if data.loc[v, semester + '_EFFORT'] == 4 or data.loc[v, semester + '_EFFORT'] == 5:
            data.loc[v, semester +'_EFFFLAG'] = 1

        # Some effort gives you only so many marks...
        elif data.loc[v, semester + '_EFFORT'] == 3:
            data.loc[v, semester +'_EFFFLAG'] = 0.5

        # NO EFFORT!! :-(
        elif data.loc[v, semester + '_EFFORT'] == 2 or data.loc[v, semester + '_EFFORT'] == 1:
            data.loc[v, semester +'_EFFFLAG'] = 0

    # Factor Analysis!
    if (semester == "PRE" and e == 30) or (semester == "PRE" and e == 22) or (semester == "PRE" and e == 28):
        # Fill out whymajs with 0 instead of NaN values so we can
        # perform FA on them
        nan_columns = [semester + "_WHYMAJ_1", semester + "_WHYMAJ_2", semester + "_WHYMAJ_3",
            semester + "_WHYMAJ_4", semester + "_WHYMAJ_5", semester + "_WHYMAJ_6",
            semester + "_WHYMAJ_7", semester + "_WHYMAJ_8", semester + "_WHYCS_1",
            semester + "_WHYCS_2", semester + "_WHYCS_3", semester + "_WHYCS_4",
            semester + "_WHYCS_5", semester + "_WHYCS_6", semester + "_WHYCS_7"
        ]
        for i in data.index:
            for column in nan_columns:
                if pd.isna(data.at[i, column]):
                    data.at[i, column] = 0

        # Factor Analysis variables
        att = [semester + '_FREQEN', semester + '_DAILYM', semester + '_DAILYG',
            semester + '_ATT_DL_3', semester + '_ATT_SC_1', semester + '_ATT_SC_2',
            semester + '_ATT_SC_4', semester + '_ATT_SC_5', semester + '_LK1',
            semester + '_LK2', semester + '_LK5', semester + '_ANX#1_1',
            semester + '_ANX#1_2', semester + '_ANX#1_3', semester + '_ANX#1_4',
            semester + '_CF_TOTAL', semester + '_ATT_DL_2', semester + '_ATT_SC_3',
            semester + "_WHYCS_1", semester + "_WHYCS_3", semester + "_WHYCS_5",
            semester + "_WHYCS_6", semester + "_EFFORT"
        ]

        # Variable selection: keep only students flagged as complete
        att_data = data.loc[data[semester + '_COMPFLAG'] == 1]
        att_data = att_data[att]
        # Drop all rows with NaN values
        att_data.dropna(inplace=True)

        swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2',
            '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5'
        ]
        for i in att_data.index:
            for col in swapList:
                swapOrdering(att_data, i, semester + col)

        # KMO and Bartlett tests
        X = att_data.copy().values
        X = check_array(X, force_all_finite='allow-nan')

        statistic, p_value = calculate_bartlett_sphericity(X)
        print("\nBartlett sphericity p={0}".format(p_value))
        kmo_per_variable, kmo_total = calculate_kmo(X)
        print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))

        # Create factor analysis object and perform factor analysis
        # Using maximum likelihood analysis (ml)
        n_factors = 5
        fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
        fa.fit(att_data)

        # Kaiser normalization and oblimin rotation
        rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
        loadings = rotator.fit_transform(fa.loadings_)

        # Set FA loadings to be rotator loadings
        fa.loadings_ = loadings

        # Get factor scores
        factor_scores = fa.transform(att_data)
        factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index, columns=["Factor "+str(i+1) for i in range(n_factors)])
        # print("\nFactor scores: \n", factor_scores)

        factor_names = ["Numerical Self Efficacy", "School Math",
            "Academic maturity", "Numerical Relevancy", "Math Anxiety"]
        # Convert factor loadings to a df
        loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)

        # Mask loadings below the conventional 0.32 salience threshold
        loadings = loadings.where(abs(loadings) > 0.32)
        print("Factor loadings: \n", loadings)

        # Histogram of each factor's scores (uncomment plt.show() to display them)
        for i, name in enumerate(factor_names, start=1):
            scores = factor_scores['Factor ' + str(i)].tolist()
            plt.hist(scores, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
            plt.title(name)
            # plt.show()

        # Update composite variables
        for i in factor_scores.index:
            data.at[i, semester + '_SELFEFF'] = factor_scores.at[i, 'Factor 1']
            data.at[i, semester + '_SCHMATH'] = factor_scores.at[i, 'Factor 2']
            data.at[i, semester + '_ACADMAT'] = factor_scores.at[i, 'Factor 3']
            data.at[i, semester + '_MATHREL'] = factor_scores.at[i, 'Factor 4']
            data.at[i, semester + '_MATHANX'] = factor_scores.at[i, 'Factor 5']

    #data.to_csv(semester+"_scored.csv", encoding='utf-8',index=False)

    #print("Results saved to " + savedname + "_scored.csv")

    return data
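A hedged usage sketch; the file names follow the docstring's example, and '17'/'Su' are the year/season strings the version logic expects for Summer 2017 data:

scored = score('QuaRCS_Summer_2017_Pre.csv', 'PRE', '17', 'Su',
               'QuaRCS Assessment Answer Key.csv', 'QuaRCS_Summer_2017_Pre')
scored.to_csv('QuaRCS_Summer_2017_Pre_scored.csv', encoding='utf-8', index=False)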
Example #6
def factor_analysis(factor_df, max_feature_count=None, plot=True):
    """
    Factor analysis: extract N factors and check whether they are effective.
    :param factor_df: DataFrame of observed variables
    :param max_feature_count: upper bound on the number of factors to try
    :param plot: whether to plot the per-factor-count statistics
    :return: dict of results keyed by factor count
    """
    ana_dic = {}
    max_feature_count = np.min(
        [factor_df.shape[1] // 3, 50]
        if max_feature_count is None else max_feature_count)
    for n_features in range(2, max_feature_count):
        logger.info(f"With {n_features} factors:")
        fa = FactorAnalyzer(n_factors=n_features, rotation=None)
        exception = None
        # fit() can raise LinAlgError on an ill-conditioned matrix; retry on
        # subsamples of decreasing size and finally on the full frame (_ == 0)
        for _ in range(8, -1, -1):
            df = factor_df if _ == 0 else factor_df.sample(
                factor_df.shape[0] // (_ + 1) * _)
            try:
                fa.fit(df)
                break
            except LinAlgError as exp:
                exception = exp
                logger.exception("Matrix %s is not invertible; retrying with a %d/(%d+1) resample",
                                 df.shape, _, _)
        else:
            raise exception

        communalities = fa.get_communalities()
        logger.info(f"\tCommunalities ({communalities.shape})")
        # logger.debug('\n%s', communalities)
        loadings = fa.loadings_
        logger.info(f"\tComponent matrix, i.e. factor loadings ({loadings.shape})")
        # logger.debug('\n%s', loadings)
        var = fa.get_factor_variance()  # contribution rates
        # 1. Sum of squared loadings (variance)
        # 2. Proportional variance
        # 3. Cumulative variance
        logger.info(f"\tCumulative variance {var[2]}")
        kmo_per_variable, kmo_total = calculate_kmo(fa.transform(factor_df))
        if kmo_total < 0.6:
            logger.info(f'\t× -> kmo_total={kmo_total:.5f}: inter-variable correlation is weak; '
                        'not suitable for factor analysis')
        else:
            logger.info(f'\t√ -> kmo_total={kmo_total:.5f}: inter-variable correlation is strong; '
                        'suitable for factor analysis')
        ana_dic[n_features] = {
            "FactorAnalyzer": fa,
            # "communalities": communalities,
            # "loadings": loadings,
            # "Sum of squared loadings": var[0],
            # "Proportional variance": var[1],
            "Cumulative variance": var[2][-1],
            "KOM_Test_total": kmo_total,
        }
        # Stop once the factors explain >95% of the variance with acceptable sampling adequacy
        if var[2][-1] > 0.95 and kmo_total > 0.6:
            break

    # Drop the fitted FactorAnalyzer objects before tabulating the per-count statistics
    ana_data = pd.DataFrame(
        {k: {kk: vv for kk, vv in v.items() if kk != 'FactorAnalyzer'}
         for k, v in ana_dic.items()}).T
    if plot:
        ana_data.plot(subplots=True, figsize=(9, 6))
        plt.show()

    return ana_dic
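A usage sketch with synthetic correlated data; the logger setup and the random frame are illustrative assumptions, and the imports are the ones the function body relies on:

import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.linalg import LinAlgError
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_kmo

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

rng = np.random.default_rng(0)
latent = rng.normal(size=(300, 3))    # three latent factors
mixing = rng.normal(size=(3, 12))     # loadings onto 12 observed variables
factor_df = pd.DataFrame(latent @ mixing + 0.3 * rng.normal(size=(300, 12)))
results = factor_analysis(factor_df, plot=False)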
Example #7
x = data[['企业单位数','流动资产','资产总额','负债总额','主营业务','利润总额','销售利润率']]

# SPSS uses the 'principal' (principal components) method.
# A common explanation found online: the first principal component explains the largest
# share of the initial variables' variance, and each subsequent component maximizes the
# variance it explains while staying orthogonal to all previous components.
# Varimax is an orthogonal rotation, so the factors remain independent; it pushes the
# entries of the loading matrix toward 0 and ±1.
fa = FactorAnalyzer(n_factors=2, rotation='varimax', method='principal')
fa.fit(x)
# Original correlation matrix
fa.corr_

# Bartlett test: is the correlation matrix an identity matrix?
calculate_bartlett_sphericity(x)
# KMO test: the overall KMO should be greater than 0.6
calculate_kmo(x)
# Communalities
fa.get_communalities()
# Loading matrix
fa.loadings_

# Factor contribution rates:
# variance (factor variance), proportional_variance, cumulative_variances
fa.get_factor_variance()

# No score matrix is provided; a score matrix is normally used to convert the raw data
# into post-factor-analysis data. Use the transform function instead.

# Oblique rotation example
model = FactorAnalyzer(n_factors=2, rotation='promax', method='principal')
model.fit(x)
# pattern matrix
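A short sketch of the transform-based scores the comment above refers to, together with the promax pattern matrix; both calls are part of factor_analyzer's public API:

scores = fa.transform(x)     # factor scores under the varimax solution
pattern = model.loadings_    # pattern matrix of the promax (oblique) solution
print(scores[:5])
print(pattern)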
Example #8
%matplotlib inline

# Data: Boston housing prices
# Note: sklearn.datasets.load_boston was removed in scikit-learn 1.2;
# this snippet targets an older release.
X = datasets.load_boston().data
y = datasets.load_boston().target.reshape(-1, 1)
data = np.concatenate((X, y), axis=1)
names = ['00犯罪率','01宅用地比例','02商用地比例','03临近河道','04NO浓度', '05房间数', '06自用房比例',
         '07到城区距离', '08交通便利指数', '09税率', '10师生比例', '11黑人比例', '12低收入比例', '13房价']
df = pd.DataFrame(X, columns=names[:-1])


""" 1. Adequacy tests: check whether common factors can be extracted from the dataset.
       The methods used are the KMO test and Bartlett's sphericity test, both of which
       measure the correlation among variables.
"""
# Method 1: KMO test (Kaiser-Meyer-Olkin Test)
kmo = factor_analyzer.calculate_kmo(df)
print(kmo[1])  # >>> 0.8530376701576892 (greater than 0.6, so factor analysis is viable)
# Method 2: Bartlett's sphericity test
bartlett = factor_analyzer.calculate_bartlett_sphericity(df)
print(bartlett[0])  # >>> 4474 (judge against your chosen significance level; if it falls below that level, factor analysis is not viable)



""" 2. Confirmatory factor analysis """

''' 2.1 Determine the number of common factors '''
# The number of common factors is unknown, so set n_factors = total number of variables
fa = factor_analyzer.FactorAnalyzer(n_factors=13, rotation=None)
fa.fit(df)
# Compute eigenvalues (their magnitudes indicate how many common factors to keep)
eigval, eigvec = fa.get_eigenvalues()
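A common next step, not part of the original snippet, is the Kaiser criterion: retain only factors whose eigenvalue exceeds 1, then refit with that count:

# Kaiser criterion: keep factors with eigenvalue > 1
n_factors = int((eigval > 1).sum())
fa = factor_analyzer.FactorAnalyzer(n_factors=n_factors, rotation='varimax')
fa.fit(df)
print(fa.loadings_)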