Example n. 1
def check_rotation(test_name,
                   factors,
                   method,
                   rotation,
                   rel_tol=0,
                   abs_tol=0.1,
                   **kwargs):
    """
    Check the rotation results.

    Parameters
    ----------
    test_name : str
        The name of the test (e.g. 'test01')
    factors : int
        The number of factors.
    method : str
        The rotation method (e.g. 'uls')
    rotation : str
        The type of rotation (e.g. 'varimax')
    rel_tol : float, optional
        The relative tolerance.
        Defaults to 0.0.
    abs_tol : float, optional
        The absolute tolerance.
        Defaults to 0.1.

    Returns
    -------
    check : float
        The proportion of elements that match between
        the calculated and expected loadings.
    """

    r_input = collect_r_output(test_name,
                               factors,
                               method,
                               'none',
                               output_types=['loading'])
    r_loading = r_input['loading']
    r_loading = normalize(r_loading, absolute=False)

    rotator = Rotator(method=rotation, **kwargs)
    rotated_loading = rotator.fit_transform(r_loading)

    r_output = collect_r_output(test_name,
                                factors,
                                method,
                                rotation,
                                output_types=['loading'])
    expected_loading = r_output['loading']

    data1 = normalize(rotated_loading)
    data2 = normalize(expected_loading)

    return check_close(data1, data2, rel_tol, abs_tol)
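The R fixtures make this test hard to run in isolation, but the rotation step it exercises is easy to reproduce. A minimal sketch with synthetic loadings (collect_r_output, normalize, and check_close are test-suite helpers not shown here):

import numpy as np
from factor_analyzer.rotator import Rotator

# Synthetic stand-in for the unrotated loadings returned by collect_r_output.
rng = np.random.RandomState(0)
loadings = rng.uniform(-1, 1, size=(10, 3))

# The call under test: fit_transform returns the rotated loading matrix.
rotated = Rotator(method='varimax').fit_transform(loadings)
print(rotated.shape)  # (10, 3)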

Example n. 2
def _out_load_matrix(df2, a_columns, m):
    fa = factor_analyzer.FactorAnalyzer(n_factors=m,
                                        method='principal',
                                        rotation='varimax')
    fa.fit(df2)
    # Rotated factor loading matrix
    rotator = Rotator()
    load_matrix = pd.DataFrame(rotator.fit_transform(fa.loadings_),
                               columns=a_columns,
                               index=df2.columns)
    print("\n因子旋转:\n", load_matrix)

    out_load_matrix_name = pd.DataFrame([[None], ['Rotated factor loadings']])
    out_load_matrix_header = pd.DataFrame([a_columns])
    out_load_matrix_data = pd.DataFrame(rotator.fit_transform(fa.loadings_))
    out_load_matrix = pd.concat(
        [out_load_matrix_name, out_load_matrix_header, out_load_matrix_data])
    out_load_matrix_dum_index = [None, None, None]
    out_load_matrix_dum_index.extend(df2.columns)
    out_load_matrix.insert(0, 'index', out_load_matrix_dum_index)
    out_load_matrix.columns = np.arange(out_load_matrix.shape[1])
    print(out_load_matrix)
    return load_matrix, out_load_matrix
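A hypothetical invocation, assuming df2 is a numeric DataFrame of observations and a_columns names one factor per column (the toy data below is an assumption):

import numpy as np
import pandas as pd
import factor_analyzer                      # required by _out_load_matrix
from factor_analyzer.rotator import Rotator  # required by _out_load_matrix

rng = np.random.RandomState(42)
df2 = pd.DataFrame(rng.normal(size=(100, 4)), columns=['v1', 'v2', 'v3', 'v4'])
a_columns = ['F_1', 'F_2']

load_matrix, out_load_matrix = _out_load_matrix(df2, a_columns, m=2)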

Example n. 3
    def varimax(self):
        # Rotator defaults to the varimax criterion; fit stores the rotated
        # loadings (loadings_) and the rotation matrix (rotation_).
        rotator = Rotator()
        a = rotator.fit(self.loading3)
        return a.rotation_, a.loadings_
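After fit, the Rotator instance exposes loadings_ (the rotated loadings) and rotation_ (the rotation matrix), which is exactly what the method above returns. A standalone sketch:

import numpy as np
from factor_analyzer.rotator import Rotator

loadings = np.random.RandomState(1).uniform(-1, 1, size=(8, 3))
rot = Rotator().fit(loadings)   # the default method is varimax
print(rot.rotation_.shape)      # (3, 3) rotation matrix
print(rot.loadings_.shape)      # (8, 3) rotated loadings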
Example n. 4
    def varclus(self, speedup=True):

        self.speedup = speedup

        if self.speedup:
            return self._varclusspu()

        ClusInfo = collections.namedtuple(
            'ClusInfo', ['clus', 'eigval1', 'eigval2', 'pc1', 'varprop'])
        c_eigvals, _, c_princomps, c_varprops = VarClusHi.pca(
            self.df[self.feat_list])
        clus0 = ClusInfo(clus=self.feat_list,
                         eigval1=c_eigvals[0],
                         eigval2=c_eigvals[1],
                         pc1=c_princomps[:, 0],
                         varprop=c_varprops[0])
        self.clusters = collections.OrderedDict([(0, clus0)])

        while True:

            if self.maxclus is not None and len(self.clusters) >= self.maxclus:
                break

            idx = max(self.clusters,
                      key=lambda x: self.clusters.get(x).eigval2)
            if self.clusters[idx].eigval2 > self.maxeigval2:
                split_clus = self.clusters[idx].clus
                c_eigvals, c_eigvecs, _, _ = VarClusHi.pca(self.df[split_clus])
            else:
                break

            if c_eigvals[1] > self.maxeigval2:
                clus1, clus2 = [], []
                rotator = Rotator(method='quartimax')
                r_eigvecs = rotator.fit_transform(pd.DataFrame(c_eigvecs))
                stand_df = (self.df - self.df.mean()) / self.df.std()
                r_pcs = np.dot(stand_df[split_clus].values, r_eigvecs)

                for feat in split_clus:

                    corr_pc1 = np.corrcoef(self.df[feat].values.T,
                                           r_pcs[:, 0])[0, 1]
                    corr_pc2 = np.corrcoef(self.df[feat].values.T,
                                           r_pcs[:, 1])[0, 1]

                    if abs(corr_pc1) > abs(corr_pc2):
                        clus1.append(feat)
                    else:
                        clus2.append(feat)

                fin_clus1, fin_clus2, _ = VarClusHi._reassign_rs(
                    self.df, clus1, clus2, self.n_rs)
                c1_eigvals, _, c1_princomps, c1_varprops = VarClusHi.pca(
                    self.df[fin_clus1])
                c2_eigvals, _, c2_princomps, c2_varprops = VarClusHi.pca(
                    self.df[fin_clus2])

                self.clusters[idx] = ClusInfo(clus=fin_clus1,
                                              eigval1=c1_eigvals[0],
                                              eigval2=c1_eigvals[1],
                                              pc1=c1_princomps[:, 0],
                                              varprop=c1_varprops[0])
                self.clusters[len(self.clusters)] = ClusInfo(
                    clus=fin_clus2,
                    eigval1=c2_eigvals[0],
                    eigval2=c2_eigvals[1],
                    pc1=c2_princomps[:, 0],
                    varprop=c2_varprops[0])
            else:
                break

        return self
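Assuming this method comes from the varclushi package (as the class name suggests), usage looks roughly like the sketch below; the constructor arguments mirror the attributes the method reads (maxeigval2, maxclus) and are an assumption:

import numpy as np
import pandas as pd
from varclushi import VarClusHi  # assumption: the class shown above

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(200, 6)),
                  columns=['x0', 'x1', 'x2', 'x3', 'x4', 'x5'])

vc = VarClusHi(df, maxeigval2=1.0, maxclus=None)
vc.varclus(speedup=False)  # take the full-PCA path shown above
for idx, info in vc.clusters.items():
    print(idx, info.clus)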
Example n. 5
    def _varclusspu(self):

        ClusInfo = collections.namedtuple(
            'ClusInfo', ['clus', 'eigval1', 'eigval2', 'eigvecs', 'varprop'])
        c_eigvals, c_eigvecs, c_corrs, c_varprops = VarClusHi.correig(
            self.df[self.feat_list])

        self.corrs = c_corrs

        clus0 = ClusInfo(clus=self.feat_list,
                         eigval1=c_eigvals[0],
                         eigval2=c_eigvals[1],
                         eigvecs=c_eigvecs,
                         varprop=c_varprops[0])
        self.clusters = collections.OrderedDict([(0, clus0)])

        while True:

            if self.maxclus is not None and len(self.clusters) >= self.maxclus:
                break

            idx = max(self.clusters,
                      key=lambda x: self.clusters.get(x).eigval2)
            if self.clusters[idx].eigval2 > self.maxeigval2:
                split_clus = self.clusters[idx].clus
                c_eigvals, c_eigvecs, split_corrs, _ = VarClusHi.correig(
                    self.df[split_clus])
            else:
                break

            if c_eigvals[1] > self.maxeigval2:
                clus1, clus2 = [], []
                rotator = Rotator(method='quartimax')
                r_eigvecs = rotator.fit_transform(pd.DataFrame(c_eigvecs))

                comb_sigma1 = math.sqrt(
                    np.dot(np.dot(r_eigvecs[:, 0], split_corrs.values),
                           r_eigvecs[:, 0].T))
                comb_sigma2 = math.sqrt(
                    np.dot(np.dot(r_eigvecs[:, 1], split_corrs.values),
                           r_eigvecs[:, 1].T))

                for feat in split_clus:

                    comb_cov1 = np.dot(r_eigvecs[:, 0],
                                       split_corrs[feat].values.T)
                    comb_cov2 = np.dot(r_eigvecs[:, 1],
                                       split_corrs[feat].values.T)

                    corr_pc1 = comb_cov1 / comb_sigma1
                    corr_pc2 = comb_cov2 / comb_sigma2

                    if abs(corr_pc1) > abs(corr_pc2):
                        clus1.append(feat)
                    else:
                        clus2.append(feat)

                fin_clus1, fin_clus2, _ = VarClusHi._reassign_rs(
                    self.df, clus1, clus2, self.n_rs)
                c1_eigvals, c1_eigvecs, _, c1_varprops = VarClusHi.correig(
                    self.df[fin_clus1])
                c2_eigvals, c2_eigvecs, _, c2_varprops = VarClusHi.correig(
                    self.df[fin_clus2])

                self.clusters[idx] = ClusInfo(clus=fin_clus1,
                                              eigval1=c1_eigvals[0],
                                              eigval2=c1_eigvals[1],
                                              eigvecs=c1_eigvecs,
                                              varprop=c1_varprops[0])
                self.clusters[len(self.clusters)] = ClusInfo(
                    clus=fin_clus2,
                    eigval1=c2_eigvals[0],
                    eigval2=c2_eigvals[1],
                    eigvecs=c2_eigvecs,
                    varprop=c2_varprops[0])
            else:
                break

        return self
Example n. 6
for i in range(len(a.columns)):
    q = 'F_' + str(i + 1)
    a_list.append(q)
    ma_list.append(q)
a.columns = a_list
ma.columns = ma_list
a.index = xlabel
ma.index = xlabel
component_matrix = copy.deepcopy(ma)
component_matrix.insert(0, u'column name', ma.index)
# Sample correlation matrix and rotated component matrix: the key output to inspect!
# If rowvar is 1 (the default), each row is a variable and each column an observation
# (sample); otherwise each row is an observation and each column a variable.
covr = np.corrcoef(data, rowvar=0)  # compute the correlation matrix
covrr = pd.DataFrame(covr)

rotator = Rotator()
# Legacy factor_analyzer API: rotate() returns (rotated_loadings, rotation_matrix).
covrr = rotator.rotate(covrr, 'varimax')
covrr = pd.DataFrame(covrr[0].values.tolist())
# Factor score coefficient matrix (component score matrix)
b = np.dot(np.linalg.inv(covr), a)
score_matrix = pd.DataFrame(b)
# rotate stop
b_list = []
for i in range(len(score_matrix.columns)):
    q = 'F_' + str(i + 1)
    b_list.append(q)
score_matrix.columns = b_list
score_matrix.insert(0, u'column name', a.index)
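Note that rotator.rotate(df, 'varimax') above is the legacy factor_analyzer interface, which appears to have returned a (rotated_loadings, rotation_matrix) tuple, hence the [0] indexing after the call. Current releases expose the same operation as fit_transform; a minimal sketch:

import numpy as np
import pandas as pd
from factor_analyzer.rotator import Rotator

loadings = pd.DataFrame(np.random.RandomState(0).uniform(-1, 1, size=(5, 2)))
rotated = Rotator(method='varimax').fit_transform(loadings)  # rotated loadings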
Example n. 7
def _varimax(X):
    return Rotator(normalize=False).fit_transform(X)
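Usage is direct; normalize=False skips the Kaiser normalization that Rotator otherwise applies by default. For example:

import numpy as np

X = np.random.RandomState(0).uniform(-1, 1, size=(6, 2))
X_rot = _varimax(X)  # varimax rotation without Kaiser normalization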
Example n. 8
    def varclus(self):
        """
        varclus的步骤
        :return:
        """
        ClusInfo = collections.namedtuple('ClusInfo', ['cluster', 'eigval1', 'eigval2', 'eigvecs', 'varprop'])
        c_eigvals, c_eigvecs, c_corrs, c_varprops = VarClus.correig(self.df[self.feat_list])

        self.corrs = c_corrs

        cluster0 = ClusInfo(cluster=self.feat_list,
                            eigval1=c_eigvals[0],
                            eigval2=c_eigvals[1],
                            eigvecs=c_eigvecs,
                            varprop=c_varprops[0]
                            )
        self.clusters = collections.OrderedDict([(0, cluster0)])

        while True:

            # Stop once the maximum number of clusters has been reached
            if self.max_cluster is not None and len(self.clusters) >= self.max_cluster:
                break

            # Find the cluster with the largest second eigenvalue (eigval2)
            idx = max(self.clusters, key=lambda x: self.clusters.get(x).eigval2)

            # Stop if the largest eigval2 is at or below the configured threshold
            if self.clusters[idx].eigval2 <= self.max_second_eig_val:
                break

            # To split a cluster: compute its first two principal components, rotate
            # them (quartimax), and assign each variable to the rotated component with
            # which its correlation is larger in absolute value.
            c_cluster = self.clusters[idx].cluster
            c_eigvals, c_eigvecs, c_corrs, _ = VarClus.correig(self.df[c_cluster])

            cluster1, cluster2 = [], []
            rotator = Rotator(method='quartimax')
            r_eigvecs = rotator.fit_transform(pd.DataFrame(c_eigvecs))

            comb_sigma1 = math.sqrt(np.dot(np.dot(r_eigvecs[:, 0], c_corrs.values), r_eigvecs[:, 0].T))
            comb_sigma2 = math.sqrt(np.dot(np.dot(r_eigvecs[:, 1], c_corrs.values), r_eigvecs[:, 1].T))

            for feat in c_cluster:

                comb_cov1 = np.dot(r_eigvecs[:, 0], c_corrs[feat].values.T)
                comb_cov2 = np.dot(r_eigvecs[:, 1], c_corrs[feat].values.T)

                corr_pc1 = comb_cov1 / comb_sigma1
                corr_pc2 = comb_cov2 / comb_sigma2

                if abs(corr_pc1) > abs(corr_pc2):
                    cluster1.append(feat)
                else:
                    cluster2.append(feat)

            fin_cluster1, fin_cluster2, _ = VarClus._reassign_rs(self.df, cluster1, cluster2, self.n_rs)
            c1_eigvals, c1_eigvecs, _, c1_varprops = VarClus.correig(self.df[fin_cluster1])
            c2_eigvals, c2_eigvecs, _, c2_varprops = VarClus.correig(self.df[fin_cluster2])

            self.clusters[idx] = ClusInfo(cluster=fin_cluster1,
                                          eigval1=c1_eigvals[0],
                                          eigval2=c1_eigvals[1],
                                          eigvecs=c1_eigvecs,
                                          varprop=c1_varprops[0]
                                          )
            self.clusters[len(self.clusters)] = ClusInfo(cluster=fin_cluster2,
                                                         eigval1=c2_eigvals[0],
                                                         eigval2=c2_eigvals[1],
                                                         eigvecs=c2_eigvecs,
                                                         varprop=c2_varprops[0]
                                                         )
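A note on the correlation shortcut in the loop above: for standardized variables with correlation matrix R, a component with weight vector w has variance w'Rw, and its covariance with variable x_j is (Rw)_j, so corr(x_j, component) = (Rw)_j / sqrt(w'Rw). That is exactly what comb_cov and comb_sigma compute, without ever forming the component scores. A quick numerical check:

import numpy as np

rng = np.random.RandomState(0)
Z = rng.normal(size=(1000, 4))
Z = (Z - Z.mean(axis=0)) / Z.std(axis=0)  # standardize the columns
R = np.corrcoef(Z, rowvar=False)          # correlation matrix
w = rng.uniform(size=4)                   # arbitrary component weights

direct = np.corrcoef(Z[:, 0], Z @ w)[0, 1]  # correlation from the scores
via_R = (R @ w)[0] / np.sqrt(w @ R @ w)     # correlation from R alone
print(np.allclose(direct, via_R))           # True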
Example n. 9
    X = att_data.copy().values
    X = check_array(X, force_all_finite='allow-nan')

    statistic, p_value = calculate_bartlett_sphericity(X)
    #print("\nBarlett sphericity p={0}".format(p_value))
    kmo_per_variable, kmo_total = calculate_kmo(X)
    #print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))

    # Create factor analysis object and perform factor analysis
    # Using maximum likelihood analysis (ml)
    n_factors = 5
    fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
    fa.fit(att_data)

    # Kaiser normalization and oblimin rotation
    rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
    loadings = rotator.fit_transform(fa.loadings_)

    # Set FA loadings to be rotator loadings
    fa.loadings_ = loadings
    #print (loadings)

    # Get factor scores
    factor_scores = fa.transform(att_data)
    factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index, columns=["Factor "+str(i+1) for i in range(n_factors)])
    #print("\nFactor scores: \n", factor_scores)

    factor_names = ["Numerical Self Efficacy", "School Math",
        "Academic maturity", "Numerical Relevancy", "Math Anxiety"]
    # Convert factor loadings to a df
    loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)
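The fragment above is lifted from the full score function (Example n. 11 below) and assumes its att and att_data variables. A self-contained sketch of the same maximum-likelihood-plus-oblimin step; the synthetic three-factor data is an assumption:

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.rotator import Rotator

rng = np.random.RandomState(0)
latent = rng.normal(size=(300, 3))             # three latent factors
weights = rng.uniform(0.3, 0.9, size=(3, 10))  # loading pattern
df = pd.DataFrame(latent @ weights + rng.normal(scale=0.5, size=(300, 10)))

fa = FactorAnalyzer(rotation=None, n_factors=3, method='ml')
fa.fit(df)

rotator = Rotator(method='oblimin', normalize=True, max_iter=25)
rotated = rotator.fit_transform(fa.loadings_)
print(rotator.phi_)  # factor correlation matrix (set for oblique rotations)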
Example n. 10
plt.grid()
plt.show()

fa = FactorAnalyzer(n_factors=2, method='principal', rotation='varimax')
fa.fit(datas)
# Communalities
print(fa.get_communalities())
# Eigenvalues
print("\nEigenvalues:\n", fa.get_factor_variance()[0])
# Proportion of variance explained
print("\nProportion of variance:\n", fa.get_factor_variance()[1])
# Cumulative proportion of variance explained
print("\nCumulative proportion of variance:\n", fa.get_factor_variance()[2])

print("\n成分矩阵:\n", fa.loadings_)
rotator = Rotator()
load_matrix = rotator.fit_transform(fa.loadings_)
print(load_matrix)

# Factor score coefficient matrix
# correlation matrix
corr = datas.corr()
# convert the array to a matrix
corr = np.mat(corr)

# factor loading matrix
load_matrix = np.mat(load_matrix)
pr_matrix_score = np.dot(nlg.inv(corr), load_matrix)
print(pr_matrix_score)
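The last matrix product solves B = R^-1 * Lambda for the factor score coefficients (the regression method). np.linalg.solve computes the same thing without forming the explicit inverse, which is numerically safer:

import numpy as np

# Equivalent to np.dot(nlg.inv(corr), load_matrix).
pr_matrix_score = np.linalg.solve(np.asarray(corr), np.asarray(load_matrix))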

Example n. 11
def score(database, semester, year, season, answer_key, savedname):
    '''
    The score function reads in a QuaRCS dataset and answer key file to create a series
    of columns to add to the dataset. The function creates columns for:
        - score on a binary scale (1 for correct, 0 for incorrect)
        - total score
        - totals and means by category
        - number of questions answered
        - total and mean confidence
    Args:
        database: pre or post QuaRCS dataset for a semester
        semester: 'PRE' or 'PST'
        year: two-digit year string (e.g. '17')
        season: season code (e.g. 'Fa')
        answer_key: QuaRCS Assessment Answer Key
        savedname: base name for the '_scored' output file
    Output:
        name of file + '_scored' as .csv file
    Example:
        score('QuaRCS_Summer_2017_Pre.csv', 'PRE', '17', 'Su',
              'QuaRCS Assessment Answer Key.csv', 'QuaRCS_Summer_2017_Pre')
        New file saved under QuaRCS_Summer_2017_Pre_scored.csv
        Check folder for files
    By:
        Abdoulaye Sanogo, 08/11/2017
    Modified to use numerical values of question/answer rather than string values.
    By:
        Ilija Nikolov, 5 March 2018
    Future Improvements:
        add columns for confidence means and totals by category
        add extra columns after insert so the deletion of columns will not be necessary
    '''

    question = semester + "_Q" # question = 'PRE_Q' or 'PST_Q'

    data = pd.read_csv(database, encoding='utf-8', skiprows=[1, 2], header=0)
    df = pd.read_csv(answer_key, encoding='utf-8')


    cols = list(data.columns.values)
    c = len(cols)
    e = 0
    h = len(data)

    # Adds the Q#_SCORE column right next to each question
    questions = np.unique(df['Question #'])

    for item in questions:
        if(question+str(item) in data.columns):
            data.insert(data.columns.get_loc(question+str(item))+1,question+str(item)+'_SCORE', 0)

    # e >= 50 --> Full, e < 50 --> Lite
    for d in range(c):
        column = cols[d]
        column = column[0:5]

        if question == column:
            e = e + 1

    data.insert(6, 'VERSION', " ")

    if e == 50:
        if(year == "16" and season == "Fa"):
            data['VERSION'] = "Fl_2.0"
            # If the value "progress bar" is in comments, change the version to 2.1
            for v in range(h):
                if 'COMMENTS' in data.columns:
                    if (data.loc[v, 'COMMENTS'] == "progress bar"):
                        data.loc[v, 'VERSION'] = "Fl_2.1"
        else:
            data['VERSION'] = "Fl_1.1"
    elif e == 54:
        data['VERSION'] = "Fl_1.0"
        data = data.drop([semester + '_Q18'], axis=1)
        data = data.drop([semester + '_Q18CF'], axis=1)
        data = data.drop([semester + '_Q25'], axis=1)
        data = data.drop([semester + '_Q25CF'], axis=1)
        e = 50
    elif e == 22:
        data['VERSION'] = "Lt_1.0"
    elif e == 30:
        intyr = int(year)
        if (intyr >= 19 or (year == "18" and season == "Fa")):
            data['VERSION'] = "Lt_2.1"
        else:
            data['VERSION'] = "Lt_2.0"
    elif e == 28:
        data['VERSION'] = "SM_1.0"

    # New columns for the totals
    data[semester + '_TOTAL'] = np.nan
    data[semester + '_PCT_TOTAL'] = np.nan
    data[semester + '_GR_TOTAL'] = np.nan
    data[semester + '_GR_MEAN'] = np.nan
    data[semester + '_AR_TOTAL'] = np.nan
    data[semester + '_AR_MEAN'] = np.nan
    data[semester + '_PR_TOTAL'] = np.nan
    data[semester + '_PR_MEAN'] = np.nan
    data[semester + '_PC_TOTAL'] = np.nan
    data[semester + '_PC_MEAN'] = np.nan
    data[semester + '_SP_TOTAL'] = np.nan
    data[semester + '_SP_MEAN'] = np.nan
    data[semester + '_TR_TOTAL'] = np.nan
    data[semester + '_TR_MEAN'] = np.nan
    data[semester + '_AV_TOTAL'] = np.nan
    data[semester + '_AV_MEAN'] = np.nan
    #data[semester + '_ER_MEAN'] = np.nan
    data[semester + '_UD_TOTAL'] = np.nan
    data[semester + '_UD_MEAN'] = np.nan
    data[semester + '_ES_TOTAL'] = np.nan
    data[semester + '_ES_MEAN'] = np.nan

    # Composite Variables
    data[semester + '_SELFEFF'] = np.nan
    data[semester + '_MATHANX'] = np.nan
    data[semester + '_MATHREL'] = np.nan
    data[semester + '_ACADMAT'] = np.nan
    data[semester + '_SCHMATH'] = np.nan

    corr_ans = {15: 0, 12: 0, 14: 0, 26: 0, 27: 0, 23: 0, 28: 0, 19: 0, 3: 0, 16: 0,
                13: 0, 31: 0, 32: 0, 29: 0, 30: 0, 5: 0, 6: 0, 7: 0, 10: 0, 11: 0,
                20: 0, 21: 0, 33: 0, 34: 0, 35: 0}
    for item in corr_ans:
        corr_ans[item] = int(list(df.loc[df['Question #']==item]['Correct Answer'])[0])

    # Adds totals and means to total and means columns
    for nn in range(h):
        qn = {15: 0, 12:0, 14:0, 26:0, 27:0, 23:0, 28:0, 19:0, 3:0, 16:0, 13:0, 31:0, 32:0, 29:0, 30:0, 5:0, 6:0, 7:0, 10:0, 11:0, 20:0, 21:0, 33:0, 34:0, 35:0}

        for q_num in qn:
            try:

                if(int(data.loc[nn, question + str(q_num)]) == corr_ans[q_num]):

                    qn[q_num] = 1
                    data.loc[nn, question+str(q_num)+'_SCORE'] = 1
            except (KeyError, ValueError, TypeError):
                pass


        GR = int(np.nansum([qn[15], qn[14], qn[12], qn[29], qn[30], qn[13]]))
        AR = int(np.nansum([qn[15], qn[14], qn[26], qn[27], qn[23], qn[28], qn[19], qn[3], qn[16], qn[31], qn[32], qn[5], qn[6], qn[7], qn[29], qn[30], qn[10], qn[11], qn[20], qn[21], qn[33], qn[34], qn[35]]))
        PR = int(np.nansum([qn[15], qn[12], qn[14], qn[23], qn[28], qn[3], qn[16], qn[7], qn[10], qn[11], qn[20], qn[21], qn[33], qn[35], qn[13]]))
        PC = int(np.nansum([qn[27], qn[3], qn[32], qn[20], qn[21]]))
        SP = int(np.nansum([qn[27], qn[23], qn[28], qn[29], qn[30], qn[20], qn[21]]))
        TR = int(np.nansum([qn[26], qn[27], qn[23]]))
        AV = int(np.nansum([qn[31], qn[10], qn[11], qn[33], qn[34]]))
        UD = int(np.nansum([qn[31], qn[6], qn[7], qn[35], qn[16]]))
        ES = int(np.nansum([qn[15], qn[12], qn[14], qn[16], qn[13]]))
        data.loc[nn, semester + '_GR_TOTAL'] = GR
        data.loc[nn, semester + '_AR_TOTAL'] = AR
        data.loc[nn, semester + '_PR_TOTAL'] = PR
        data.loc[nn, semester + '_PC_TOTAL'] = PC
        data.loc[nn, semester + '_SP_TOTAL'] = SP
        data.loc[nn, semester + '_TR_TOTAL'] = TR
        data.loc[nn, semester + '_AV_TOTAL'] = AV
        data.loc[nn, semester + '_UD_TOTAL'] = UD
        data.loc[nn, semester + '_ES_TOTAL'] = ES
        total_full = 0

        for q_num in qn:
                total_full += qn[q_num]
        if e == 50:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 25
            data.loc[nn, semester + '_GR_MEAN'] = GR/6
            data.loc[nn, semester + '_AR_MEAN'] = AR/23
            data.loc[nn, semester + '_PR_MEAN'] = PR/15
            data.loc[nn, semester + '_PC_MEAN'] = PC/5
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/5
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/5

        elif e == 22:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 11
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/9
            data.loc[nn, semester + '_PR_MEAN'] = PR/8
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_ES_MEAN'] = ES/5

            # lacks enough questions for a meaningful subscore
            # 1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            #2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
            #1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan

        elif e == 30:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 15
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/11
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/4
            data.loc[nn, semester + '_ES_MEAN'] = ES/5
            # lacks enough questions for a meaningful subscore
            # 1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            #2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan

        elif e == 28:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 14
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/9
            data.loc[nn, semester + '_PC_MEAN'] = PC/3
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/3

            # lacks enough questions for a meaningful subscore
            # 2 q
            data.loc[nn, semester + '_TR_MEAN'] = np.nan
            data.loc[nn, semester + '_TR_TOTAL'] = np.nan
            #1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan



    data[semester  + '_CF_TOTAL'] = np.nan
    data[semester  + '_CF_TOTAL_CORR'] = np.nan
    data[semester  + '_CF_TOTAL_INCORR'] = np.nan
    data[semester + '_CF_MEAN'] = np.nan
    data[semester + '_CF_MEAN_CORR'] = np.nan
    data[semester + '_CF_MEAN_INCORR'] = np.nan


    # Calculates confidence totals and means; adds to respective columns
    for u in range(h):
        qcf = {'15': 0, '12':0, '14':0, '26':0, '27':0, '23':0, '28':0, '19':0, '3':0, '16':0, '13':0, '31':0, '32':0, '29':0, '30':0, '5':0, '6':0, '7':0, '10':0, '11':0,'20':0, '21':0, '33':0, '34':0, '35':0}
        qc = {'15': 0, '12':0, '14':0, '26':0, '27':0, '23':0, '28':0, '19':0, '3':0, '16':0, '13':0, '31':0, '32':0, '29':0, '30':0, '5':0, '6':0, '7':0, '10':0, '11':0,'20':0, '21':0, '33':0, '34':0, '35':0}

        for q_num in qcf:
            try:
                qcf[q_num] = int(data.loc[u, question + str(q_num) + "CF"])

                qc[q_num] = int(data.loc[u, question + str(q_num) + '_SCORE'])
            except (KeyError, ValueError, TypeError):
                pass

        medscore = 0
        corrscore = 0
        incorrscore = 0
        confcount = 0
        for item in qcf:
            medscore += qcf[item]

            if qcf[item] > 0:
                confcount +=1

                if qc[item] == 1:
                    corrscore += qcf[item]
                else:
                    incorrscore += qcf[item]
        #print(confcount)
        if (confcount == 0):
            confcount = 1
        # Student's score
        numcorr = data.loc[u, semester + '_TOTAL']

        # Calculate confidence scores
        if e == 30:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount

            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

        elif e == 22:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 28:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

        elif e == 50:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount

            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

    data[semester + '_QCOMPLETE'] = 0
    data[semester + '_COMPFLAG'] = 0
    data[semester + '_EFFFLAG'] = 0

    # Counts number of completed columns
    try:
        if e == 50:
            q = [15, 12, 14, 26, 27, 23, 28, 19, 3, 16, 13, 31, 32, 29, 30, 5, 6, 7, 10, 11, 20, 21, 33, 34, 35]
        elif e == 22:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16]
        elif e == 30:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16, 10, 11, 33, 34]
        elif e == 28:
            q = [6, 7, 13, 14, 16, 20, 21, 23, 27, 28, 29, 30, 31, 35]

        for v in range(h):
            # Count up totals
            total = 0
            for w in q:
                count = question + str(w)
                answered = data.loc[v, count]
                # Count the question as answered unless the response is blank or NaN
                if str(answered) not in ('nan', ' '):
                    total += 1

            data.loc[v, semester + '_QCOMPLETE'] = total

            # Add completed flag
            if total == len(q):
                data.loc[v, semester + '_COMPFLAG'] = 1
            else:
                data.loc[v, semester + '_COMPFLAG'] = 0
    except (KeyError, NameError):  # q is undefined for unrecognized versions
        pass

    # Calculating effort column

    for v in range(h):
        # If there is no response for effort, mark completion as 0 for that student!
        if (pd.isnull(data.loc[v, semester + '_EFFORT'])):
            data.loc[v, semester + '_COMPFLAG'] = 0

        # If there is high effort, give full marks in flag
        if data.loc[v, semester + '_EFFORT'] in (4, 5):
            data.loc[v, semester + '_EFFFLAG'] = 1

        # Some effort gives you only so many marks...
        elif data.loc[v, semester + '_EFFORT'] == 3:
            data.loc[v, semester + '_EFFFLAG'] = 0.5

        # NO EFFORT!! :-(
        elif data.loc[v, semester + '_EFFORT'] in (1, 2):
            data.loc[v, semester + '_EFFFLAG'] = 0

    # Factor Analysis!
    if (semester == "PRE" and e == 30) or (semester == "PRE" and e == 22) or (semester == "PRE" and e == 28):
        # Fill out whymajs with 0 instead of NaN values so we can
        # perform FA on them
        nan_columns = [semester + "_WHYMAJ_1", semester + "_WHYMAJ_2", semester + "_WHYMAJ_3",
            semester + "_WHYMAJ_4", semester + "_WHYMAJ_5", semester + "_WHYMAJ_6",
            semester + "_WHYMAJ_7", semester + "_WHYMAJ_8", semester + "_WHYCS_1",
            semester + "_WHYCS_2", semester + "_WHYCS_3", semester + "_WHYCS_4",
            semester + "_WHYCS_5", semester + "_WHYCS_6", semester + "_WHYCS_7"
        ]
        for i in data.index:
            for column in nan_columns:
                if pd.isna(data.at[i, column]):
                    data.at[i, column] = 0

        # Factor Analysis variables
        att = [semester + '_FREQEN', semester + '_DAILYM', semester + '_DAILYG',
            semester + '_ATT_DL_3', semester + '_ATT_SC_1', semester + '_ATT_SC_2',
            semester + '_ATT_SC_4', semester + '_ATT_SC_5', semester + '_LK1',
            semester + '_LK2', semester + '_LK5', semester + '_ANX#1_1',
            semester + '_ANX#1_2', semester + '_ANX#1_3', semester + '_ANX#1_4',
            semester + '_CF_TOTAL', semester + '_ATT_DL_2', semester + '_ATT_SC_3',
            semester + "_WHYCS_1", semester + "_WHYCS_3", semester + "_WHYCS_5",
            semester + "_WHYCS_6", semester + "_EFFORT"
        ]

        # Variable selection
        att_data = data.loc[ data[semester + '_COMPFLAG']==1 ]
        att_data = att_data[att]
        # Drop all rows with NaN values
        att_data.dropna(inplace=True)

        swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2',
            '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5'
        ]
        for i in att_data.index:
            for col in swapList:
                swapOrdering(att_data, i, semester + col)

        # KMO and Bartlett tests
        X = att_data.copy().values
        X = check_array(X, force_all_finite='allow-nan')

        statistic, p_value = calculate_bartlett_sphericity(X)
        print("\nBarlett sphericity p={0}".format(p_value))
        kmo_per_variable, kmo_total = calculate_kmo(X)
        print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))

        # Create factor analysis object and perform factor analysis
        # Using maximum likelihood analysis (ml)
        n_factors = 5
        fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
        fa.fit(att_data)

        # Kaiser normalization and oblimin rotation
        rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
        loadings = rotator.fit_transform(fa.loadings_)

        # Set FA loadings to be rotator loadings
        fa.loadings_ = loadings

        # Get factor scores
        factor_scores = fa.transform(att_data)
        factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index, columns=["Factor "+str(i+1) for i in range(n_factors)])
        # print("\nFactor scores: \n", factor_scores)

        factor_names = ["Numerical Self Efficacy", "School Math",
            "Academic maturity", "Numerical Relevancy", "Math Anxiety"]
        # Convert factor loadings to a df
        loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)

        # Drop non-meaningful values
        loadings = loadings.where(abs(loadings) > 0.32)
        print("Factor loadings: \n", loadings)

        scores1 = factor_scores['Factor 1'].tolist()
        plt.hist(scores1, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Numerical Self Efficacy")
        # plt.show()

        scores2 = factor_scores['Factor 2'].tolist()
        plt.hist(scores2, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("School Math")
        # plt.show()

        scores3 = factor_scores['Factor 3'].tolist()
        plt.hist(scores3, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Academic maturity")
        # plt.show()

        scores4 = factor_scores['Factor 4'].tolist()
        plt.hist(scores4, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Numerical Relevancy")
        # plt.show()

        scores5 = factor_scores['Factor 5'].tolist()
        plt.hist(scores5, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Math Anxiety")
        # plt.show()

        # Update composite variables
        for i in factor_scores.index:
            data.at[i, semester + '_SELFEFF'] = factor_scores.at[i, 'Factor 1']
            data.at[i, semester + '_SCHMATH'] = factor_scores.at[i, 'Factor 2']
            data.at[i, semester + '_ACADMAT'] = factor_scores.at[i, 'Factor 3']
            data.at[i, semester + '_MATHREL'] = factor_scores.at[i, 'Factor 4']
            data.at[i, semester + '_MATHANX'] = factor_scores.at[i, 'Factor 5']

    #data.to_csv(semester+"_scored.csv", encoding='utf-8',index=False)

    #print("Results saved to " + savedname + "_scored.csv")

    return data
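A hypothetical call, with placeholder file names following the docstring's example:

scored = score(database='QuaRCS_Summer_2017_Pre.csv',
               semester='PRE',
               year='17',
               season='Su',
               answer_key='QuaRCS Assessment Answer Key.csv',
               savedname='QuaRCS_Summer_2017_Pre')
print(scored.head())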